diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index b0b3a1020e618f..987034cae61e02 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -24,6 +24,5 @@ RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
liblttng-ust-dev \
libssl-dev \
libkrb5-dev \
- zlib1g-dev \
ninja-build \
tzdata
diff --git a/.devcontainer/android/Dockerfile b/.devcontainer/android/Dockerfile
index 092e291fc6290d..bdbc7d68f258cb 100644
--- a/.devcontainer/android/Dockerfile
+++ b/.devcontainer/android/Dockerfile
@@ -21,9 +21,7 @@ RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
liblttng-ust-dev \
libssl-dev \
libkrb5-dev \
- zlib1g-dev \
ninja-build \
- zlib1g-dev \
ninja-build \
openjdk-17-jdk \
pulseaudio
diff --git a/.devcontainer/wasm-multiThreaded/Dockerfile b/.devcontainer/wasm-multiThreaded/Dockerfile
index 70fc2380fdd098..ed0ee4f35f5d73 100644
--- a/.devcontainer/wasm-multiThreaded/Dockerfile
+++ b/.devcontainer/wasm-multiThreaded/Dockerfile
@@ -24,7 +24,6 @@ RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
liblttng-ust-dev \
libssl-dev \
libkrb5-dev \
- zlib1g-dev \
ninja-build
SHELL ["/bin/bash", "-c"]
diff --git a/.devcontainer/wasm/Dockerfile b/.devcontainer/wasm/Dockerfile
index 63335387f3cd1e..3950e19fcf1de4 100644
--- a/.devcontainer/wasm/Dockerfile
+++ b/.devcontainer/wasm/Dockerfile
@@ -23,7 +23,6 @@ RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
liblttng-ust-dev \
libssl-dev \
libkrb5-dev \
- zlib1g-dev \
ninja-build
SHELL ["/bin/bash", "-c"]
diff --git a/.github/workflows/check-no-merge-label.yml b/.github/workflows/check-no-merge-label.yml
new file mode 100644
index 00000000000000..1c01c2f7324175
--- /dev/null
+++ b/.github/workflows/check-no-merge-label.yml
@@ -0,0 +1,25 @@
+name: check-no-merge-label
+
+permissions:
+ pull-requests: read
+
+on:
+ pull_request:
+ types: [opened, edited, reopened, labeled, unlabeled, synchronize]
+ branches:
+ - 'main'
+ - 'release/**'
+
+jobs:
+ check-labels:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Check 'NO-MERGE' label
+ run: |
+ echo "Merging permission is disabled when the 'NO-MERGE' label is applied."
+ if [ "${{ contains(github.event.pull_request.labels.*.name, 'NO-MERGE') }}" = "false" ]; then
+ exit 0
+ else
+ echo "::error:: The 'NO-MERGE' label was applied to the PR. Merging is disabled."
+ exit 1
+ fi
diff --git a/.github/workflows/check-service-labels.yml b/.github/workflows/check-service-labels.yml
index 5261cc165ee128..2d85e4d278a393 100644
--- a/.github/workflows/check-service-labels.yml
+++ b/.github/workflows/check-service-labels.yml
@@ -15,7 +15,7 @@ jobs:
steps:
- name: Check 'Servicing-approved' label
run: |
- echo "Merging permission is enabled for servicing PRs when the `Servicing-approved` label is applied."
+ echo "Merging permission is enabled for servicing PRs when the 'Servicing-approved' label is applied."
if [ "${{ contains(github.event.pull_request.labels.*.name, 'Servicing-approved') }}" = "true" ]; then
exit 0
else
diff --git a/Directory.Build.props b/Directory.Build.props
index 7b78bd6efc0f6b..5eeb3ed7f86822 100644
--- a/Directory.Build.props
+++ b/Directory.Build.props
@@ -63,7 +63,7 @@
- src/mono/msbuild/apple/build/AppleBuild.targets
- src/installer/pkg/sfx/bundle/shared-framework-distribution-template-x64.xml
- src/installer/pkg/sfx/bundle/shared-framework-distribution-template-arm64.xml
- - src/tasks/AotCompilerTask/MonoAOTCompiler.props
+ - src/mono/msbuild/common/MonoAOTCompiler.props
- src/tasks/AppleAppBuilder/Xcode.cs
- src/tasks/MobileBuildTasks/Apple/AppleProject.cs
- https://github.com/dotnet/sdk repo > src/Installer/redist-installer/targets/GeneratePKG.targets
diff --git a/eng/SourceBuildPrebuiltBaseline.xml b/eng/SourceBuildPrebuiltBaseline.xml
index 4e5b0e471c1565..69be84119be4d9 100644
--- a/eng/SourceBuildPrebuiltBaseline.xml
+++ b/eng/SourceBuildPrebuiltBaseline.xml
@@ -16,6 +16,7 @@
+
-
+
https://github.com/dotnet/cecil
- 9e8bd520939ddfee686261267a1646c1b113d9e1
+ 2d5c8fb9aa8bd4c7fc085a73520061075c601655
@@ -77,146 +77,146 @@
-
+
https://github.com/dotnet/source-build-externals
- 591e522d15c8c9ffad7c7c1df1ae6a3d392717b4
+ 0c377e9585d2aeae504ff1d6529ccb1abef36172
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
https://github.com/dotnet/llvm-project
@@ -302,39 +302,39 @@
https://github.com/dotnet/llvm-project
317598aea216019b6164f599859c738f69595c60
-
+
https://github.com/dotnet/runtime
- 5c6d1b3f7b63a3150ce6c737aeb4af03b3cce621
+ 29013d8ae50f5bc35427a9155234ccebfa5e227c
-
+
https://github.com/dotnet/runtime
- 5c6d1b3f7b63a3150ce6c737aeb4af03b3cce621
+ 29013d8ae50f5bc35427a9155234ccebfa5e227c
-
+
https://github.com/dotnet/runtime
- 5c6d1b3f7b63a3150ce6c737aeb4af03b3cce621
+ 29013d8ae50f5bc35427a9155234ccebfa5e227c
-
+
https://github.com/dotnet/runtime
- 5c6d1b3f7b63a3150ce6c737aeb4af03b3cce621
+ 29013d8ae50f5bc35427a9155234ccebfa5e227c
-
+
https://github.com/dotnet/runtime
- 5c6d1b3f7b63a3150ce6c737aeb4af03b3cce621
+ 29013d8ae50f5bc35427a9155234ccebfa5e227c
-
+
https://github.com/dotnet/runtime
- 5c6d1b3f7b63a3150ce6c737aeb4af03b3cce621
+ 29013d8ae50f5bc35427a9155234ccebfa5e227c
-
+
https://github.com/dotnet/runtime
- 5c6d1b3f7b63a3150ce6c737aeb4af03b3cce621
+ 29013d8ae50f5bc35427a9155234ccebfa5e227c
-
+
https://github.com/dotnet/runtime
- 5c6d1b3f7b63a3150ce6c737aeb4af03b3cce621
+ 29013d8ae50f5bc35427a9155234ccebfa5e227c
https://github.com/dotnet/xharness
@@ -348,9 +348,9 @@
https://github.com/dotnet/xharness
3119edb6d70fb252e6128b0c7e45d3fc2f49f249
-
+
https://github.com/dotnet/arcade
- e7cb34898a1b610eb2a22591a2178da6f1fb7e3c
+ 3bb46f96cc988a80a414f45394f8a9ce54b47d3b
https://dev.azure.com/dnceng/internal/_git/dotnet-optimization
@@ -368,25 +368,25 @@
https://dev.azure.com/dnceng/internal/_git/dotnet-optimization
93bf80f30db2e15a7d62c22ff80fecf3518519b1
-
+
https://github.com/dotnet/hotreload-utils
- 7d2f352486b2e39a7829fc7fefa7d6cf825deff5
+ a8ba820e852306e5098dce560629cd98e0eb8a4a
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
-
+
https://github.com/dotnet/roslyn
- 557c46c532788c16881dbe1b9bd3d938c2ed22e0
+ 250065a15ef94895ef87f3e09b313b987375b5b1
-
+
https://github.com/dotnet/roslyn
- 557c46c532788c16881dbe1b9bd3d938c2ed22e0
+ 250065a15ef94895ef87f3e09b313b987375b5b1
-
+
https://github.com/dotnet/roslyn
- 557c46c532788c16881dbe1b9bd3d938c2ed22e0
+ 250065a15ef94895ef87f3e09b313b987375b5b1
https://github.com/dotnet/roslyn-analyzers
@@ -397,20 +397,14 @@
5ed336762c6260a83ece35cd1f6749251452bad0
-
+
https://github.com/dotnet/roslyn
- 557c46c532788c16881dbe1b9bd3d938c2ed22e0
+ 250065a15ef94895ef87f3e09b313b987375b5b1
-
+
https://github.com/dotnet/sdk
- 13330d5ded0b2b2bcd6459d6a410aa6220b11040
-
-
-
- https://github.com/dotnet/sdk
- 13330d5ded0b2b2bcd6459d6a410aa6220b11040
-
+ 23e2ba847d79562b972dbf54eca3f87c3044d925
https://dev.azure.com/dnceng/internal/_git/dotnet-optimization
@@ -470,9 +464,9 @@
https://github.com/dotnet/node
703264f70f553a06adfb330378c96f56b7583273
-
+
https://github.com/dotnet/runtime-assets
- 6082ed1bb2cfd2d394cdc0ec613c88f3754041f7
+ fc476e8f2d685eb7cadf6342393a0af2708f4dbf
diff --git a/eng/Versions.props b/eng/Versions.props
index 31865578e84bd8..861bd235583e0c 100644
--- a/eng/Versions.props
+++ b/eng/Versions.props
@@ -44,9 +44,9 @@
Any tools that contribute to the design-time experience should use the MicrosoftCodeAnalysisVersion_LatestVS property above to ensure
they do not break the local dev experience.
-->
- 4.14.0-1.25077.5
- 4.14.0-1.25077.5
- 4.14.0-1.25077.5
+ 4.14.0-2.25079.2
+ 4.14.0-2.25079.2
+ 4.14.0-2.25079.2
- 10.0.100-alpha.1.24622.2
+ 10.0.100-alpha.1.25077.2
- 10.0.0-beta.25058.4
- 10.0.0-beta.25058.4
- 10.0.0-beta.25058.4
- 10.0.0-beta.25058.4
- 2.9.2-beta.25058.4
- 10.0.0-beta.25058.4
- 2.9.2-beta.25058.4
- 10.0.0-beta.25058.4
- 10.0.0-beta.25058.4
- 10.0.0-beta.25058.4
- 10.0.0-beta.25058.4
- 10.0.0-beta.25058.4
- 10.0.0-beta.25058.4
- 10.0.0-beta.25058.4
- 10.0.0-beta.25058.4
- 10.0.0-beta.25058.4
+ 10.0.0-beta.25079.2
+ 10.0.0-beta.25079.2
+ 10.0.0-beta.25079.2
+ 10.0.0-beta.25079.2
+ 2.9.2-beta.25079.2
+ 10.0.0-beta.25079.2
+ 2.9.2-beta.25079.2
+ 10.0.0-beta.25079.2
+ 10.0.0-beta.25079.2
+ 10.0.0-beta.25079.2
+ 10.0.0-beta.25079.2
+ 10.0.0-beta.25079.2
+ 10.0.0-beta.25079.2
+ 10.0.0-beta.25079.2
+ 10.0.0-beta.25079.2
+ 10.0.0-beta.25079.2
1.4.0
6.0.0-preview.1.102
- 10.0.0-alpha.1.25057.24
+ 10.0.0-alpha.1.25068.1
6.0.0
- 10.0.0-alpha.1.25057.24
+ 10.0.0-alpha.1.25068.1
6.0.0
6.0.0
@@ -123,46 +123,46 @@
5.0.0
4.6.0
4.6.0
- 10.0.0-alpha.1.25057.24
- 10.0.0-alpha.1.25057.24
+ 10.0.0-alpha.1.25068.1
+ 10.0.0-alpha.1.25068.1
6.0.0
5.0.0
5.0.0
5.0.0
7.0.0
- 10.0.0-alpha.1.25057.24
+ 10.0.0-alpha.1.25068.1
6.1.0
7.0.0
4.6.0
4.5.0
- 10.0.0-alpha.1.25057.24
+ 10.0.0-alpha.1.25068.1
8.0.0
8.0.4
8.0.0
8.0.0
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
- 10.0.0-beta.25060.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
+ 10.0.0-beta.25070.2
10.0.0-prerelease.24610.1
10.0.0-prerelease.24610.1
10.0.0-prerelease.24610.1
- 10.0.0-alpha.0.24627.1
+ 10.0.0-alpha.0.25070.1
1.0.0-prerelease.25067.2
1.0.0-prerelease.25067.2
@@ -219,9 +219,9 @@
9.0.0-preview-20241010.1
- 0.11.5-alpha.24627.1
+ 0.11.5-alpha.25069.2
- 10.0.0-alpha.1.24627.1
+ 10.0.0-preview.2.25074.1
2.4.3
9.0.0-alpha.1.24167.3
diff --git a/eng/common/internal/Tools.csproj b/eng/common/internal/Tools.csproj
index 32f79dfb3402c0..feaa6d20812d8f 100644
--- a/eng/common/internal/Tools.csproj
+++ b/eng/common/internal/Tools.csproj
@@ -15,16 +15,6 @@
-
-
-
- https://devdiv.pkgs.visualstudio.com/_packaging/dotnet-core-internal-tooling/nuget/v3/index.json;
-
-
- $(RestoreSources);
- https://devdiv.pkgs.visualstudio.com/_packaging/VS/nuget/v3/index.json;
-
-
diff --git a/eng/common/tools.ps1 b/eng/common/tools.ps1
index 04b02f4fd3cc86..80f9130b15087c 100644
--- a/eng/common/tools.ps1
+++ b/eng/common/tools.ps1
@@ -42,7 +42,7 @@
[bool]$useInstalledDotNetCli = if (Test-Path variable:useInstalledDotNetCli) { $useInstalledDotNetCli } else { $true }
# Enable repos to use a particular version of the on-line dotnet-install scripts.
-# default URL: https://dotnet.microsoft.com/download/dotnet/scripts/v1/dotnet-install.ps1
+# default URL: https://builds.dotnet.microsoft.com/dotnet/scripts/v1/dotnet-install.ps1
[string]$dotnetInstallScriptVersion = if (Test-Path variable:dotnetInstallScriptVersion) { $dotnetInstallScriptVersion } else { 'v1' }
# True to use global NuGet cache instead of restoring packages to repository-local directory.
@@ -262,7 +262,7 @@ function GetDotNetInstallScript([string] $dotnetRoot) {
if (!(Test-Path $installScript)) {
Create-Directory $dotnetRoot
$ProgressPreference = 'SilentlyContinue' # Don't display the console progress UI - it's a huge perf hit
- $uri = "https://dotnet.microsoft.com/download/dotnet/scripts/$dotnetInstallScriptVersion/dotnet-install.ps1"
+ $uri = "https://builds.dotnet.microsoft.com/dotnet/scripts/v1/dotnet-install.ps1"
Retry({
Write-Host "GET $uri"
diff --git a/eng/common/tools.sh b/eng/common/tools.sh
index 40485a0f59de16..df203b5178421d 100755
--- a/eng/common/tools.sh
+++ b/eng/common/tools.sh
@@ -54,7 +54,7 @@ warn_as_error=${warn_as_error:-true}
use_installed_dotnet_cli=${use_installed_dotnet_cli:-true}
# Enable repos to use a particular version of the on-line dotnet-install scripts.
-# default URL: https://dotnet.microsoft.com/download/dotnet/scripts/v1/dotnet-install.sh
+# default URL: https://builds.dotnet.microsoft.com/dotnet/scripts/v1/dotnet-install.sh
dotnetInstallScriptVersion=${dotnetInstallScriptVersion:-'v1'}
# True to use global NuGet cache instead of restoring packages to repository-local directory.
@@ -295,7 +295,7 @@ function with_retries {
function GetDotNetInstallScript {
local root=$1
local install_script="$root/dotnet-install.sh"
- local install_script_url="https://dotnet.microsoft.com/download/dotnet/scripts/$dotnetInstallScriptVersion/dotnet-install.sh"
+ local install_script_url="https://builds.dotnet.microsoft.com/dotnet/scripts/v1/dotnet-install.sh"
if [[ ! -a "$install_script" ]]; then
mkdir -p "$root"
diff --git a/eng/pipelines/common/xplat-setup.yml b/eng/pipelines/common/xplat-setup.yml
index 8bedc3fbce652c..fda4c66b4a791f 100644
--- a/eng/pipelines/common/xplat-setup.yml
+++ b/eng/pipelines/common/xplat-setup.yml
@@ -71,6 +71,8 @@ jobs:
value: zip
- name: tarCompression
value: ''
+ - name: exeExt
+ value: '.exe'
- name: scriptExt
value: '.cmd'
- name: dir
@@ -91,6 +93,8 @@ jobs:
value: tar
- name: tarCompression
value: gz
+ - name: exeExt
+ value: ''
- name: scriptExt
value: '.sh'
- name: dir
diff --git a/eng/pipelines/coreclr/templates/crossgen2-comparison-build-job.yml b/eng/pipelines/coreclr/templates/crossgen2-comparison-build-job.yml
index bf598beec35d35..01f9ed4cfb5309 100644
--- a/eng/pipelines/coreclr/templates/crossgen2-comparison-build-job.yml
+++ b/eng/pipelines/coreclr/templates/crossgen2-comparison-build-job.yml
@@ -69,10 +69,7 @@ jobs:
- name: target_crossgen2_os
value: osx
- name: crossgen2location
- value: $(productDirectory)$(dir)$(targetFlavor)$(dir)crossgen2$(dir)crossgen2.dll
- - ${{ if ne(parameters.archType, 'x64') }}:
- - name: crossgen2location
- value: $(productDirectory)$(dir)$(targetFlavor)$(dir)x64$(dir)crossgen2$(dir)crossgen2.dll
+ value: $(binDirectory)$(dir)crossgen2_inbuild$(dir)$(archType)$(dir)$(buildConfigUpper)$(dir)crossgen2.dll
- name: librariesProductDllDir
value: $(Build.SourcesDirectory)$(dir)artifacts$(dir)bin$(dir)runtime$(dir)net10.0-$(osGroup)$(osSubgroup)-$(buildConfig)-$(archType)
diff --git a/global.json b/global.json
index 20e56cc5204710..41f301cd8a0f98 100644
--- a/global.json
+++ b/global.json
@@ -1,18 +1,18 @@
{
"sdk": {
- "version": "10.0.100-alpha.1.24610.7",
+ "version": "10.0.100-alpha.1.25077.2",
"allowPrerelease": true,
"rollForward": "major"
},
"tools": {
- "dotnet": "10.0.100-alpha.1.24610.7"
+ "dotnet": "10.0.100-alpha.1.25077.2"
},
"msbuild-sdks": {
- "Microsoft.DotNet.Arcade.Sdk": "10.0.0-beta.25058.4",
- "Microsoft.DotNet.Helix.Sdk": "10.0.0-beta.25058.4",
- "Microsoft.DotNet.SharedFramework.Sdk": "10.0.0-beta.25058.4",
+ "Microsoft.DotNet.Arcade.Sdk": "10.0.0-beta.25079.2",
+ "Microsoft.DotNet.Helix.Sdk": "10.0.0-beta.25079.2",
+ "Microsoft.DotNet.SharedFramework.Sdk": "10.0.0-beta.25079.2",
"Microsoft.Build.NoTargets": "3.7.0",
"Microsoft.Build.Traversal": "3.4.0",
- "Microsoft.NET.Sdk.IL": "10.0.0-alpha.1.25057.24"
+ "Microsoft.NET.Sdk.IL": "10.0.0-alpha.1.25068.1"
}
}
diff --git a/src/coreclr/gc/unix/config.gc.h.in b/src/coreclr/gc/unix/config.gc.h.in
index 01cb767798fbcd..dfc38aea6b8470 100644
--- a/src/coreclr/gc/unix/config.gc.h.in
+++ b/src/coreclr/gc/unix/config.gc.h.in
@@ -6,6 +6,7 @@
#cmakedefine01 HAVE_SYS_TIME_H
#cmakedefine01 HAVE_SYS_MMAN_H
+#cmakedefine01 HAVE_SYS_MEMBARRIER_H
#cmakedefine01 HAVE_PTHREAD_THREADID_NP
#cmakedefine01 HAVE_PTHREAD_GETTHREADID_NP
#cmakedefine01 HAVE_VM_FLAGS_SUPERPAGE_SIZE_ANY
diff --git a/src/coreclr/gc/unix/configure.cmake b/src/coreclr/gc/unix/configure.cmake
index c3b301f58938f0..8d33b81a32f727 100644
--- a/src/coreclr/gc/unix/configure.cmake
+++ b/src/coreclr/gc/unix/configure.cmake
@@ -11,6 +11,7 @@ include(CheckLibraryExists)
check_include_files(sys/time.h HAVE_SYS_TIME_H)
check_include_files(sys/mman.h HAVE_SYS_MMAN_H)
check_include_files(pthread_np.h HAVE_PTHREAD_NP_H)
+check_include_files(sys/membarrier.h HAVE_SYS_MEMBARRIER_H)
check_function_exists(vm_allocate HAVE_VM_ALLOCATE)
check_function_exists(sysctlbyname HAVE_SYSCTLBYNAME)
diff --git a/src/coreclr/gc/unix/gcenv.unix.cpp b/src/coreclr/gc/unix/gcenv.unix.cpp
index 37ce5943a20ff8..43588c66eb015a 100644
--- a/src/coreclr/gc/unix/gcenv.unix.cpp
+++ b/src/coreclr/gc/unix/gcenv.unix.cpp
@@ -29,6 +29,14 @@
#include
#endif
+#ifdef __linux__
+#include <sys/syscall.h>
+#include <linux/membarrier.h>
+#define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__)
+#elif HAVE_SYS_MEMBARRIER_H
+#include <sys/membarrier.h>
+#endif
+
#include
#undef min
@@ -94,10 +102,6 @@ extern "C"
#include
#endif // __HAIKU__
-#ifdef __linux__
-#include <sys/syscall.h> // __NR_membarrier
-#endif
-
#if HAVE_PTHREAD_NP_H
#include <pthread_np.h>
#endif
@@ -132,29 +136,9 @@ typedef cpuset_t cpu_set_t;
// The cached total number of CPUs that can be used in the OS.
static uint32_t g_totalCpuCount = 0;
-//
-// Helper membarrier function
-//
-#ifdef __NR_membarrier
-# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__)
-#else
-# define membarrier(...) -ENOSYS
-#endif
-
-enum membarrier_cmd
-{
- MEMBARRIER_CMD_QUERY = 0,
- MEMBARRIER_CMD_GLOBAL = (1 << 0),
- MEMBARRIER_CMD_GLOBAL_EXPEDITED = (1 << 1),
- MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED = (1 << 2),
- MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3),
- MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
- MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5),
- MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6)
-};
-
bool CanFlushUsingMembarrier()
{
+#if defined(__linux__) || HAVE_SYS_MEMBARRIER_H
#ifdef TARGET_ANDROID
// Avoid calling membarrier on older Android versions where membarrier
@@ -169,15 +153,16 @@ bool CanFlushUsingMembarrier()
// Starting with Linux kernel 4.14, process memory barriers can be generated
// using MEMBARRIER_CMD_PRIVATE_EXPEDITED.
- int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);
+ int mask = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);
if (mask >= 0 &&
mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED &&
// Register intent to use the private expedited command.
- membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0) == 0)
+ membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0) == 0)
{
return true;
}
+#endif
return false;
}
@@ -423,12 +408,15 @@ bool GCToOSInterface::CanGetCurrentProcessorNumber()
// Flush write buffers of processors that are executing threads of the current process
void GCToOSInterface::FlushProcessWriteBuffers()
{
+#if defined(__linux__) || HAVE_SYS_MEMBARRIER_H
if (s_flushUsingMemBarrier)
{
- int status = membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
+ int status = membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
assert(status == 0 && "Failed to flush using membarrier");
}
- else if (g_helperPage != 0)
+ else
+#endif
+ if (g_helperPage != 0)
{
int status = pthread_mutex_lock(&g_flushProcessWriteBuffersMutex);
assert(status == 0 && "Failed to lock the flushProcessWriteBuffersMutex lock");
diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp
index 9c215b0ec69918..8811349b8cdbb3 100644
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@@ -3490,13 +3490,15 @@ void CodeGen::genCodeForNegNot(GenTree* tree)
GenTree* operand = tree->gtGetOp1();
// The src must be a register.
- if (tree->OperIs(GT_NEG) && operand->isContained())
+ if (tree->OperIs(GT_NEG, GT_NOT) && operand->isContained())
{
genTreeOps oper = operand->OperGet();
switch (oper)
{
case GT_MUL:
{
+ assert(tree->OperIs(GT_NEG));
+
ins = INS_mneg;
GenTree* op1 = tree->gtGetOp1();
GenTree* a = op1->gtGetOp1();
@@ -3510,7 +3512,7 @@ void CodeGen::genCodeForNegNot(GenTree* tree)
case GT_RSH:
case GT_RSZ:
{
- assert(ins == INS_neg || ins == INS_negs);
+ assert(ins == INS_neg || ins == INS_negs || ins == INS_mvn);
assert(operand->gtGetOp2()->IsCnsIntOrI());
assert(operand->gtGetOp2()->isContained());
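The arm64 change above extends contained-shift handling from NEG to NOT: a GT_NOT whose operand is a contained shift folds into a single mvn with a shifted register source. A hypothetical emitter call producing such a form (register choices arbitrary, assuming the arm64 emitter's shifted-register emitIns_R_R_I overload):

// mvn w10, w11, LSL #2   ; computes ~(w11 << 2) in one instruction
theEmitter->emitIns_R_R_I(INS_mvn, EA_4BYTE, REG_R10, REG_R11, 2, INS_OPTS_LSL);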
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 4f9b2c4f0184b2..a86a2ad5b36969 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -402,12 +402,13 @@ void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size,
else
{
// For section constant, the immediate will be relocatable
- GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm DEBUGARG(targetHandle) DEBUGARG(gtFlags));
+ GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm,
+ INS_OPTS_NONE DEBUGARG(targetHandle) DEBUGARG(gtFlags));
}
}
else
{
- GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm DEBUGARG(targetHandle) DEBUGARG(gtFlags));
+ GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm, INS_OPTS_NONE DEBUGARG(targetHandle) DEBUGARG(gtFlags));
}
}
regSet.verifyRegUsed(reg);
@@ -738,12 +739,10 @@ void CodeGen::genCodeForNegNot(GenTree* tree)
{
GenTree* operand = tree->gtGetOp1();
assert(operand->isUsedFromReg());
- regNumber operandReg = genConsumeReg(operand);
+ regNumber operandReg = genConsumeReg(operand);
+ instruction ins = genGetInsForOper(tree->OperGet(), targetType);
- inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true);
-
- instruction ins = genGetInsForOper(tree->OperGet(), targetType);
- inst_RV(ins, targetReg, targetType);
+ GetEmitter()->emitIns_BASE_R_R(ins, emitActualTypeSize(tree), targetReg, operandReg);
}
genProduceReg(tree);
@@ -1063,6 +1062,8 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode)
GenTree* op1 = treeNode->gtGetOp1();
GenTree* op2 = treeNode->gtGetOp2();
+ bool eligibleForNDD = false;
+
// Commutative operations can mark op1 as contained or reg-optional to generate "op reg, memop/immed"
if (!op1->isUsedFromReg())
{
@@ -1158,31 +1159,57 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode)
// reg3 = reg3 op reg2
else
{
- var_types op1Type = op1->TypeGet();
- inst_Mov(op1Type, targetReg, op1reg, /* canSkip */ false);
- regSet.verifyRegUsed(targetReg);
- gcInfo.gcMarkRegPtrVal(targetReg, op1Type);
- dst = treeNode;
- src = op2;
+ // when reg3 != reg1 && reg3 != reg2, and NDD is available, we can use APX-EVEX.ND to optimize the codegen.
+ eligibleForNDD = emit->DoJitUseApxNDD(ins);
+ if (!eligibleForNDD)
+ {
+ var_types op1Type = op1->TypeGet();
+ inst_Mov(op1Type, targetReg, op1reg, /* canSkip */ false);
+ regSet.verifyRegUsed(targetReg);
+ gcInfo.gcMarkRegPtrVal(targetReg, op1Type);
+ dst = treeNode;
+ src = op2;
+ }
+ else
+ {
+ dst = op1;
+ src = op2;
+ }
}
+ // We can assume all the floating-point instructions were handled and returned above.
+ assert(!varTypeIsFloating(treeNode));
+
// try to use an inc or dec
- if (oper == GT_ADD && !varTypeIsFloating(treeNode) && src->isContainedIntOrIImmed() && !treeNode->gtOverflowEx())
+ if (oper == GT_ADD && src->isContainedIntOrIImmed() && !treeNode->gtOverflowEx())
{
if (src->IsIntegralConst(1))
{
- emit->emitIns_R(INS_inc, emitTypeSize(treeNode), targetReg);
+ emit->emitIns_BASE_R_R(INS_inc, emitTypeSize(treeNode), targetReg, dst->GetRegNum());
genProduceReg(treeNode);
return;
}
else if (src->IsIntegralConst(-1))
{
- emit->emitIns_R(INS_dec, emitTypeSize(treeNode), targetReg);
+ emit->emitIns_BASE_R_R(INS_dec, emitTypeSize(treeNode), targetReg, dst->GetRegNum());
genProduceReg(treeNode);
return;
}
}
- regNumber r = emit->emitInsBinary(ins, emitTypeSize(treeNode), dst, src);
+
+ regNumber r = REG_NA;
+ if (eligibleForNDD)
+ {
+ // operands should already be formatted above
+ assert(dst->isUsedFromReg());
+ assert(op1reg != targetReg);
+ assert(op2reg != targetReg);
+ r = emit->emitIns_BASE_R_R_RM(ins, emitTypeSize(treeNode), targetReg, treeNode, dst, src);
+ }
+ else
+ {
+ r = emit->emitInsBinary(ins, emitTypeSize(treeNode), dst, src);
+ }
noway_assert(r == targetReg);
if (treeNode->gtOverflowEx())
@@ -1295,10 +1322,7 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode)
}
assert(regOp->isUsedFromReg());
- // Setup targetReg when neither of the source operands was a matching register
- inst_Mov(targetType, mulTargetReg, regOp->GetRegNum(), /* canSkip */ true);
-
- emit->emitInsBinary(ins, size, treeNode, rmOp);
+ emit->emitIns_BASE_R_R_RM(ins, size, mulTargetReg, treeNode, regOp, rmOp);
// Move the result to the desired register, if necessary
if (ins == INS_mulEAX)
@@ -4406,23 +4430,24 @@ void CodeGen::genCodeForLockAdd(GenTreeOp* node)
if (imm == 1)
{
// inc [addr]
- GetEmitter()->emitIns_AR(INS_inc, size, addr->GetRegNum(), 0);
+ GetEmitter()->emitIns_AR(INS_inc, size, addr->GetRegNum(), 0, INS_OPTS_EVEX_NoApxPromotion);
}
else if (imm == -1)
{
// dec [addr]
- GetEmitter()->emitIns_AR(INS_dec, size, addr->GetRegNum(), 0);
+ GetEmitter()->emitIns_AR(INS_dec, size, addr->GetRegNum(), 0, INS_OPTS_EVEX_NoApxPromotion);
}
else
{
// add [addr], imm
- GetEmitter()->emitIns_I_AR(INS_add, size, imm, addr->GetRegNum(), 0);
+ GetEmitter()->emitIns_I_AR(INS_add, size, imm, addr->GetRegNum(), 0, INS_OPTS_EVEX_NoApxPromotion);
}
}
else
{
// add [addr], data
- GetEmitter()->emitIns_AR_R(INS_add, size, data->GetRegNum(), addr->GetRegNum(), 0);
+ GetEmitter()->emitIns_AR_R(INS_add, size, data->GetRegNum(), addr->GetRegNum(), 0,
+ INS_OPTS_EVEX_NoApxPromotion);
}
}
@@ -4459,7 +4484,8 @@ void CodeGen::genLockedInstructions(GenTreeOp* node)
// or/and dword ptr [addrReg], val
//
instGen(INS_lock);
- GetEmitter()->emitIns_AR_R(ins, size, data->GetRegNum(), addr->GetRegNum(), 0);
+ GetEmitter()->emitIns_AR_R(ins, size, data->GetRegNum(), addr->GetRegNum(), 0,
+ INS_OPTS_EVEX_NoApxPromotion);
}
else
{
@@ -4842,11 +4868,10 @@ void CodeGen::genCodeForShift(GenTree* tree)
return;
}
#endif
- // First, move the operand to the destination register and
- // later on perform the shift in-place.
- // (LSRA will try to avoid this situation through preferencing.)
- inst_Mov(targetType, tree->GetRegNum(), operandReg, /* canSkip */ true);
- inst_RV_SH(ins, size, tree->GetRegNum(), shiftByValue);
+ ins = genMapShiftInsToShiftByConstantIns(ins, shiftByValue);
+ GetEmitter()->emitIns_BASE_R_R_I(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, shiftByValue);
+ genProduceReg(tree);
+ return;
}
}
#if defined(TARGET_64BIT)
@@ -4887,8 +4912,7 @@ void CodeGen::genCodeForShift(GenTree* tree)
// The operand to be shifted must not be in ECX
noway_assert(operandReg != REG_RCX);
- inst_Mov(targetType, tree->GetRegNum(), operandReg, /* canSkip */ true);
- inst_RV(ins, tree->GetRegNum(), targetType);
+ GetEmitter()->emitIns_BASE_R_R(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg);
}
genProduceReg(tree);
@@ -9237,6 +9261,96 @@ void CodeGen::genAmd64EmitterUnitTestsApx()
theEmitter->emitIns_S(INS_neg, EA_2BYTE, 0, 0);
theEmitter->emitIns_S(INS_not, EA_2BYTE, 0, 0);
+
+ // APX-EVEX
+
+ theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
+ theEmitter->emitIns_R_R_R(INS_sub, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
+ theEmitter->emitIns_R_R_R(INS_or, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
+ theEmitter->emitIns_R_R_R(INS_and, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
+ theEmitter->emitIns_R_R_R(INS_xor, EA_1BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
+
+ theEmitter->emitIns_R_R_I(INS_or, EA_2BYTE, REG_R10, REG_EAX, 10565, INS_OPTS_EVEX_nd);
+ theEmitter->emitIns_R_R_I(INS_or, EA_8BYTE, REG_R10, REG_EAX, 10, INS_OPTS_EVEX_nd);
+ theEmitter->emitIns_R_R_S(INS_or, EA_8BYTE, REG_R10, REG_EAX, 0, 1, INS_OPTS_EVEX_nd);
+
+ theEmitter->emitIns_R_R(INS_neg, EA_2BYTE, REG_R10, REG_ECX, INS_OPTS_EVEX_nd);
+
+ theEmitter->emitIns_R_R(INS_shl, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd);
+ theEmitter->emitIns_R_R(INS_shl_1, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd);
+ theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd);
+ theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd);
+ theEmitter->emitIns_R_R_I(INS_rcr_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd);
+ theEmitter->emitIns_R_R_I(INS_rcl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd);
+
+ theEmitter->emitIns_R_R(INS_inc, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd);
+ theEmitter->emitIns_R_R(INS_dec, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd);
+
+ theEmitter->emitIns_R_R_R(INS_cmovo, EA_4BYTE, REG_R12, REG_R11, REG_EAX, INS_OPTS_EVEX_nd);
+
+ theEmitter->emitIns_R_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, REG_ECX, INS_OPTS_EVEX_nd);
+ theEmitter->emitIns_R_R_S(INS_imul, EA_4BYTE, REG_R12, REG_R11, 0, 1, INS_OPTS_EVEX_nd);
+
+ theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_R(INS_sub, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R(INS_inc, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R(INS_dec, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf);
+
+ theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_I(INS_sub, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf);
+
+ theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
+
+ theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R(INS_shl, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R(INS_shl_1, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_I(INS_rcr_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_I(INS_rcl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf);
+
+ theEmitter->emitIns_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_S(INS_imul, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
+
+ theEmitter->emitIns_R_I(INS_imul_15, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf);
+
+ theEmitter->emitIns_R(INS_imulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R(INS_mulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf);
+
+ theEmitter->emitIns_R_R(INS_tzcnt_apx, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_R(INS_lzcnt_apx, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_R(INS_popcnt_apx, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
+
+ theEmitter->emitIns_R_S(INS_tzcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_S(INS_lzcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_S(INS_popcnt_apx, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
+
+ theEmitter->emitIns_R_R_R(INS_add, EA_2BYTE, REG_R12, REG_R13, REG_R11,
+ (insOpts)(INS_OPTS_EVEX_nf | INS_OPTS_EVEX_nd));
+
+ theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_R_R(INS_bextr, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf);
+
+ theEmitter->emitIns_R_R(INS_blsi, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_R(INS_blsmsk, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf);
+ theEmitter->emitIns_R_S(INS_blsr, EA_8BYTE, REG_R11, 0, 1);
+
+ theEmitter->emitIns_AR(INS_inc, EA_4BYTE, REG_EAX, 0, INS_OPTS_EVEX_NoApxPromotion);
+
+ theEmitter->emitIns_BASE_R_R(INS_inc, EA_4BYTE, REG_R11, REG_R12);
+ theEmitter->emitIns_BASE_R_R_I(INS_add, EA_4BYTE, REG_R11, REG_R12, 5);
}
void CodeGen::genAmd64EmitterUnitTestsAvx10v2()
@@ -11434,7 +11548,7 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind)
if (barrierKind == BARRIER_FULL)
{
instGen(INS_lock);
- GetEmitter()->emitIns_I_AR(INS_or, EA_4BYTE, 0, REG_SPBASE, 0);
+ GetEmitter()->emitIns_I_AR(INS_or, EA_4BYTE, 0, REG_SPBASE, 0, INS_OPTS_EVEX_NoApxPromotion);
}
}
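The codegen changes above share one idea: with APX-EVEX.ND, a binary operation gets a destination distinct from its sources, so the mov that the two-operand RMW encodings forced can be dropped. A rough sketch using the emitter entry points exercised by the new unit tests (register choices arbitrary, emit being the live emitter; a sketch, not the exact lowering code):

// Without NDD: the destination must alias a source, so copy first (RMW form):
//     mov r10, rax
//     add r10, rcx
emit->emitIns_Mov(INS_mov, EA_8BYTE, REG_R10, REG_EAX, /* canSkip */ true);
emit->emitIns_R_R(INS_add, EA_8BYTE, REG_R10, REG_ECX);

// With NDD (DoJitUseApxNDD(INS_add) returns true): one promoted instruction
// whose destination is distinct from both sources:
//     add r10, rax, rcx
emit->emitIns_R_R_R(INS_add, EA_8BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);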
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 33dda8c734ca1b..a1e84a95c89a48 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2299,6 +2299,7 @@ void Compiler::compSetProcessor()
if (canUseApxEncoding())
{
codeGen->GetEmitter()->SetUseRex2Encoding(true);
+ codeGen->GetEmitter()->SetUsePromotedEVEXEncoding(true);
}
}
#endif // TARGET_XARCH
@@ -4871,11 +4872,6 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
DoPhase(this, PHASE_COMPUTE_DOMINATORS, &Compiler::fgComputeDominators);
}
- // Drop back to just checking profile likelihoods.
- //
- activePhaseChecks &= ~PhaseChecks::CHECK_PROFILE;
- activePhaseChecks |= PhaseChecks::CHECK_LIKELIHOODS;
-
#ifdef DEBUG
fgDebugCheckLinks();
#endif
@@ -5156,11 +5152,6 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
DoPhase(this, PHASE_SWITCH_RECOGNITION, &Compiler::optSwitchRecognition);
}
- // Drop back to just checking profile likelihoods.
- //
- activePhaseChecks &= ~PhaseChecks::CHECK_PROFILE;
- activePhaseChecks |= PhaseChecks::CHECK_LIKELIHOODS;
-
#ifdef DEBUG
// Stash the current estimate of the function's size if necessary.
if (verbose && opts.OptimizationEnabled())
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 442dd8f17a5a24..df37e7ff272fde 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -4014,7 +4014,7 @@ class Compiler
// false: we can add new tracked variables.
// true: We cannot add new 'tracked' variable
- bool lvaTrackedFixed = false;
+ bool lvaTrackedFixed = false;
unsigned lvaCount; // total number of locals, which includes function arguments,
// special arguments, IL local variables, and JIT temporary variables
@@ -6310,8 +6310,7 @@ class Compiler
void fgPrintEdgeWeights();
#endif
PhaseStatus fgComputeBlockWeights();
- bool fgComputeMissingBlockWeights(weight_t* returnWeight);
- bool fgComputeCalledCount(weight_t returnWeight);
+ bool fgComputeMissingBlockWeights();
bool fgReorderBlocks(bool useProfile);
void fgDoReversePostOrderLayout();
@@ -6327,7 +6326,6 @@ class Compiler
BasicBlock** blockOrder;
BasicBlock** tempOrder;
unsigned numCandidateBlocks;
- unsigned currEHRegion;
#ifdef DEBUG
weight_t GetLayoutCost(unsigned startPos, unsigned endPos);
@@ -6342,7 +6340,7 @@ class Compiler
void AddNonFallthroughPreds(unsigned blockPos);
bool RunGreedyThreeOptPass(unsigned startPos, unsigned endPos);
- bool RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock);
+ bool RunThreeOptPass();
public:
ThreeOptLayout(Compiler* comp);
@@ -6925,7 +6923,7 @@ class Compiler
unsigned acdCount = 0;
// Get the index to use as part of the AddCodeDsc key for sharing throw blocks
- unsigned bbThrowIndex(BasicBlock* blk, AcdKeyDesignator* dsg);
+ unsigned bbThrowIndex(BasicBlock* blk, AcdKeyDesignator* dsg);
struct AddCodeDscKey
{
@@ -6933,7 +6931,7 @@ class Compiler
AddCodeDscKey(): acdKind(SCK_NONE), acdData(0) {}
AddCodeDscKey(SpecialCodeKind kind, BasicBlock* block, Compiler* comp);
AddCodeDscKey(AddCodeDsc* add);
-
+
static bool Equals(const AddCodeDscKey& x, const AddCodeDscKey& y)
{
return (x.acdData == y.acdData) && (x.acdKind == y.acdKind);
@@ -10014,10 +10012,10 @@ class Compiler
}
//------------------------------------------------------------------------
- // canUseRex2Encoding - Answer the question: Is Rex2 encoding supported on this target.
+ // canUseApxEncoding - Answer the question: Are APX encodings supported on this target.
//
// Returns:
- // `true` if Rex2 encoding is supported, `false` if not.
+ // `true` if APX encoding is supported, `false` if not.
//
bool canUseApxEncoding() const
{
@@ -10069,7 +10067,7 @@ class Compiler
bool DoJitStressRex2Encoding() const
{
#ifdef DEBUG
- if (JitConfig.JitStressRex2Encoding() && compOpportunisticallyDependsOn(InstructionSet_APX))
+ if (JitConfig.JitStressRex2Encoding())
{
// we should make sure EVEX is also stressed when REX2 is stressed, as we will need to guarantee EGPR
// functionality is properly turned on for every instructions when REX2 is stress.
@@ -10084,13 +10082,30 @@ class Compiler
// JitStressEvexEncoding- Answer the question: Is Evex stress knob set
//
// Returns:
- // `true` if user requests REX2 encoding.
+ // `true` if user requests EVEX encoding.
//
bool JitStressEvexEncoding() const
{
#ifdef DEBUG
return JitConfig.JitStressEvexEncoding() || JitConfig.JitStressRex2Encoding();
#endif // DEBUG
+ return false;
+ }
+
+ //------------------------------------------------------------------------
+ // DoJitStressPromotedEvexEncoding- Answer the question: Do we force promoted EVEX encoding.
+ //
+ // Returns:
+ // `true` if user requests promoted EVEX encoding.
+ //
+ bool DoJitStressPromotedEvexEncoding() const
+ {
+#ifdef DEBUG
+ if (JitConfig.JitStressPromotedEvexEncoding())
+ {
+ return true;
+ }
+#endif // DEBUG
return false;
}
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 02461633f3c547..e8854e78a2200d 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -471,6 +471,7 @@ class emitter
SetUseVEXEncoding(false);
SetUseEvexEncoding(false);
SetUseRex2Encoding(false);
+ SetUsePromotedEVEXEncoding(false);
#endif // TARGET_XARCH
emitDataSecCur = nullptr;
@@ -793,7 +794,19 @@ class emitter
// For normal and embedded broadcast intrinsics, EVEX.L'L has the same semantic, vector length.
// For embedded rounding, EVEX.L'L semantic changes to indicate the rounding mode.
// Multiple bits in _idEvexbContext are used to inform emitter to specially handle the EVEX.L'L bits.
- unsigned _idEvexbContext : 2;
+ unsigned _idCustom5 : 1;
+ unsigned _idCustom6 : 1;
+
+#define _idEvexbContext ((_idCustom6 << 1) | _idCustom5) /* Evex.b: embedded broadcast, embedded rounding, embedded SAE */
+#define _idEvexNdContext _idCustom5 /* bits used for the APX-EVEX.nd context for promoted legacy instructions */
+#define _idEvexNfContext _idCustom6 /* bits used for the APX-EVEX.nf context for promoted legacy/vex instructions */
+
+ // In certain cases, we do not allow instructions to be promoted to APX-EVEX.
+ // e.g. instructions like add/and/or/inc/dec can be used with the LOCK prefix, but cannot carry LOCK and
+ // EVEX prefixes together.
+ unsigned _idNoApxEvexXPromotion : 1;
#endif // TARGET_XARCH
#ifdef TARGET_ARM64
@@ -826,8 +839,8 @@ class emitter
////////////////////////////////////////////////////////////////////////
// Space taken up to here:
- // x86: 48 bits
- // amd64: 48 bits
+ // x86: 49 bits
+ // amd64: 49 bits
// arm: 48 bits
// arm64: 55 bits
// loongarch64: 46 bits
@@ -845,7 +858,7 @@ class emitter
#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
#define ID_EXTRA_BITFIELD_BITS (14)
#elif defined(TARGET_XARCH)
-#define ID_EXTRA_BITFIELD_BITS (16)
+#define ID_EXTRA_BITFIELD_BITS (17)
#else
#error Unsupported or unset target architecture
#endif
@@ -879,8 +892,8 @@ class emitter
////////////////////////////////////////////////////////////////////////
// Space taken up to here (with/without prev offset, assuming host==target):
- // x86: 54/50 bits
- // amd64: 55/50 bits
+ // x86: 55/51 bits
+ // amd64: 56/51 bits
// arm: 54/50 bits
// arm64: 62/57 bits
// loongarch64: 53/48 bits
@@ -1657,38 +1670,17 @@ class emitter
#ifdef TARGET_XARCH
bool idIsEvexbContextSet() const
{
- return _idEvexbContext != 0;
+ return idGetEvexbContext() != 0;
}
void idSetEvexbContext(insOpts instOptions)
{
assert(!idIsEvexbContextSet());
+ assert(idGetEvexbContext() == 0);
+ unsigned value = static_cast<unsigned>(instOptions & INS_OPTS_EVEX_b_MASK);
- switch (instOptions & INS_OPTS_EVEX_b_MASK)
- {
- case INS_OPTS_EVEX_eb_er_rd:
- {
- _idEvexbContext = 1;
- break;
- }
-
- case INS_OPTS_EVEX_er_ru:
- {
- _idEvexbContext = 2;
- break;
- }
-
- case INS_OPTS_EVEX_er_rz:
- {
- _idEvexbContext = 3;
- break;
- }
-
- default:
- {
- unreached();
- }
- }
+ _idCustom5 = ((value >> 0) & 1);
+ _idCustom6 = ((value >> 1) & 1);
}
unsigned idGetEvexbContext() const
@@ -1728,6 +1720,39 @@ class emitter
assert(!idIsEvexZContextSet());
_idEvexZContext = 1;
}
+
+ bool idIsEvexNdContextSet() const
+ {
+ return _idEvexNdContext != 0;
+ }
+
+ void idSetEvexNdContext()
+ {
+ assert(!idIsEvexNdContextSet());
+ _idEvexNdContext = 1;
+ }
+
+ bool idIsEvexNfContextSet() const
+ {
+ return _idEvexNfContext != 0;
+ }
+
+ void idSetEvexNfContext()
+ {
+ assert(!idIsEvexNfContextSet());
+ _idEvexNfContext = 1;
+ }
+
+ bool idIsNoApxEvexPromotion() const
+ {
+ return _idNoApxEvexXPromotion != 0;
+ }
+
+ void idSetNoApxEvexPromotion()
+ {
+ assert(!idIsNoApxEvexPromotion());
+ _idNoApxEvexXPromotion = 1;
+ }
#endif
#ifdef TARGET_ARMARCH
@@ -2531,7 +2556,12 @@ class emitter
CORINFO_FIELD_HANDLE emitSimdMaskConst(simdmask_t constValue);
#endif // FEATURE_MASKED_HW_INTRINSICS
#endif // FEATURE_SIMD
+
+#if defined(TARGET_XARCH)
+ regNumber emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src, regNumber targetReg = REG_NA);
+#else
regNumber emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src);
+#endif
regNumber emitInsTernary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src1, GenTree* src2);
void emitInsLoadInd(instruction ins, emitAttr attr, regNumber dstReg, GenTreeIndir* mem);
void emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* mem);
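Since _idCustom5/_idCustom6 now do double duty (the 2-bit EVEX.b context for SIMD instructions, the ND/NF flags for promoted legacy instructions), a standalone mock may help picture the packing; this is a hypothetical illustration, not the real instrDesc:

struct InsDescBitsMock
{
    unsigned custom5 : 1;             // EVEX.b low bit, or the APX-EVEX.nd flag
    unsigned custom6 : 1;             // EVEX.b high bit, or the APX-EVEX.nf flag
    unsigned noApxEvexPromotion : 1;  // blocks EVEX promotion (e.g. for LOCK-prefixed ops)

    // SIMD view: the two bits together form the 2-bit EVEX.b context.
    unsigned EvexbContext() const { return (custom6 << 1) | custom5; }

    // APX view: each bit is an independent flag on a promoted instruction.
    bool EvexNdContext() const { return custom5 != 0; }
    bool EvexNfContext() const { return custom6 != 0; }
};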
diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp
index 3fd34318d9ec11..10d344b22c02fa 100644
--- a/src/coreclr/jit/emitarm64.cpp
+++ b/src/coreclr/jit/emitarm64.cpp
@@ -12311,7 +12311,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
// If there are 2 GC vars in this instrDesc, get the 2nd variable
// that should be tracked.
- adr2 = emitComp->lvaFrameAddress(varNum2, &FPbased2, true);
+ adr2 = emitComp->lvaFrameAddress(varNum2, &FPbased2, FPbased);
ofs2Dist = EA_SIZE_IN_BYTES(size);
#ifdef DEBUG
assert(FPbased == FPbased2);
diff --git a/src/coreclr/jit/emitfmtsxarch.h b/src/coreclr/jit/emitfmtsxarch.h
index f893fce8d07eea..a94a7c1b3e7d5b 100644
--- a/src/coreclr/jit/emitfmtsxarch.h
+++ b/src/coreclr/jit/emitfmtsxarch.h
@@ -140,6 +140,7 @@ IF_DEF(RRW_RRW, IS_R1_RW|IS_R2_RW, NONE) // r/w
IF_DEF(RRD_RRD_CNS, IS_R1_RD|IS_R2_RD, SCNS) // read reg1, read reg2, const
IF_DEF(RWR_RRD_CNS, IS_R1_WR|IS_R2_RD, SCNS) // write reg1, read reg2, const
IF_DEF(RRW_RRD_CNS, IS_R1_RW|IS_R2_RD, SCNS) // r/w reg1, read reg2, const
+IF_DEF(RWR_RRD_SHF, IS_R1_WR|IS_R2_RD, SCNS) // write reg1, read reg2, shift
IF_DEF(RRD_RRD_RRD, IS_R1_RD|IS_R2_RD|IS_R3_RD, NONE) // read reg1, read reg2, read reg3
IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE) // write reg1, read reg2, read reg3
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index ddec8af5e753f5..8b51ff52f181f8 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -236,6 +236,18 @@ bool emitter::HasRex2Encoding(instruction ins) const
return (flags & Encoding_REX2) != 0;
}
+bool emitter::HasApxNdd(instruction ins) const
+{
+ insFlags flags = CodeGenInterface::instInfo[ins];
+ return (flags & INS_Flags_Has_NDD) != 0;
+}
+
+bool emitter::HasApxNf(instruction ins) const
+{
+ insFlags flags = CodeGenInterface::instInfo[ins];
+ return (flags & INS_Flags_Has_NF) != 0;
+}
+
bool emitter::IsVexEncodableInstruction(instruction ins) const
{
if (!UseVEXEncoding())
@@ -293,6 +305,106 @@ bool emitter::IsRex2EncodableInstruction(instruction ins) const
return HasRex2Encoding(ins);
}
+//------------------------------------------------------------------------
+// IsApxNDDEncodableInstruction: Answer the question- does this instruction have apx ndd form.
+//
+// Arguments:
+// ins - The instruction to check.
+//
+// Returns:
+// `true` if ins has apx ndd form.
+//
+bool emitter::IsApxNDDEncodableInstruction(instruction ins) const
+{
+ if (!UsePromotedEVEXEncoding())
+ {
+ return false;
+ }
+
+ return HasApxNdd(ins);
+}
+
+//------------------------------------------------------------------------
+// IsApxNFEncodableInstruction: Answer the question - does this instruction support EVEX.nf.
+//
+// Arguments:
+// ins - The instruction to check.
+//
+// Returns:
+// `true` if EVEX.nf is supported for ins.
+//
+bool emitter::IsApxNFEncodableInstruction(instruction ins) const
+{
+ if (!UsePromotedEVEXEncoding())
+ {
+ return false;
+ }
+
+ return HasApxNf(ins);
+}
+
+//------------------------------------------------------------------------
+// IsApxExtendedEvexInstruction: Answer the question - does this instruction have apx extended evex form.
+//
+// Arguments:
+// ins - The instruction to check.
+//
+// Returns:
+// `true` if ins has apx extended evex form.
+//
+bool emitter::IsApxExtendedEvexInstruction(instruction ins) const
+{
+ if (!UsePromotedEVEXEncoding())
+ {
+ return false;
+ }
+
+ return HasApxNdd(ins) || HasApxNf(ins);
+}
+
+//------------------------------------------------------------------------
+// IsShiftInstruction: Answer the question- is this instruction a shift instruction.
+//
+// Arguments:
+// ins - The instruction to check.
+//
+// Returns:
+// `true` if ins is a shift instruction.
+//
+bool emitter::IsShiftInstruction(instruction ins) const
+{
+ switch (ins)
+ {
+ case INS_rcl_1:
+ case INS_rcr_1:
+ case INS_rol_1:
+ case INS_ror_1:
+ case INS_shl_1:
+ case INS_shr_1:
+ case INS_sar_1:
+
+ case INS_rcl:
+ case INS_rcr:
+ case INS_rol:
+ case INS_ror:
+ case INS_shl:
+ case INS_shr:
+ case INS_sar:
+
+ case INS_rcl_N:
+ case INS_rcr_N:
+ case INS_rol_N:
+ case INS_ror_N:
+ case INS_shl_N:
+ case INS_shr_N:
+ case INS_sar_N:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
//------------------------------------------------------------------------
// IsLegacyMap1: Answer the question- Is this instruction on legacy-map-1
//
@@ -324,7 +436,7 @@ bool emitter::IsLegacyMap1(code_t code) const
if ((code & 0xFF00FF00) == 0x0F000000)
{
- // 4-byte, need to check if PP is a prefix.
+ // 4-byte, need to check if PP is one of the prefix bytes (0x66/0xF2/0xF3)
BYTE prefix = (BYTE)((code & 0xFF0000) >> 16);
return ((prefix == 0xF2) || (prefix == 0xF3) || (prefix == 0x66));
}
@@ -647,6 +759,24 @@ bool emitter::IsRexW1EvexInstruction(instruction ins)
return false;
}
+//------------------------------------------------------------------------
+// DoJitUseApxNDD: Answer the question: does JIT use APX NDD feature on the given instruction?
+//
+// Arguments:
+// ins - instruction to test
+//
+// Return Value:
+// true if JIT allows APX NDD to be applied to the instruction.
+//
+bool emitter::DoJitUseApxNDD(instruction ins) const
+{
+#if !defined(TARGET_AMD64)
+ return false;
+#else
+ return JitConfig.EnableApxNDD() && IsApxNDDEncodableInstruction(ins);
+#endif
+}
+
#ifdef TARGET_64BIT
//------------------------------------------------------------------------
// AreUpperBitsZero: check if some previously emitted
@@ -1257,6 +1387,179 @@ insOpts emitter::GetEmbRoundingMode(uint8_t mode) const
}
}
+//------------------------------------------------------------------------
+// emitHandleGCrefRegs: Update GC ref related registers' liveness.
+//
+// Arguments:
+// dst - Destination buffer.
+// id - instruction descriptor to the GC ref instruction.
+//
+void emitter::emitHandleGCrefRegs(BYTE* dst, instrDesc* id)
+{
+ regNumber reg1 = id->idReg1(); // dst and src1
+ regNumber reg2 = id->idReg2(); // src2
+ switch (id->idInsFmt())
+ {
+ case IF_RRD_RRD:
+ break;
+
+ case IF_RWR_RRD:
+ {
+ if (emitSyncThisObjReg != REG_NA && emitIGisInProlog(emitCurIG) && reg2 == (int)REG_ARG_0)
+ {
+ // We're relocating "this" in the prolog
+ assert(emitComp->lvaIsOriginalThisArg(0));
+ assert(emitComp->lvaTable[0].lvRegister);
+ assert(emitComp->lvaTable[0].GetRegNum() == reg1);
+
+ if (emitFullGCinfo)
+ {
+ emitGCregLiveSet(id->idGCref(), genRegMask(reg1), dst, true);
+ break;
+ }
+ else
+ {
+ /* If emitFullGCinfo==false, then we don't use any
+ regPtrDsc's and so explicitly note the location
+ of "this" in GCEncode.cpp
+ */
+ }
+ }
+
+ emitGCregLiveUpd(id->idGCref(), reg1, dst);
+ break;
+ }
+
+ case IF_RRW_RRD:
+ case IF_RWR_RRD_RRD:
+ {
+ regNumber targetReg = reg1; // dst
+
+ // If the instruction is encoded in NDD form,
+ // the src registers will be the 2nd and 3rd register on id.
+ if (id->idInsFmt() == IF_RWR_RRD_RRD)
+ {
+ reg1 = id->idReg2(); // src1
+ reg2 = id->idReg3(); // src2
+ }
+
+ switch (id->idIns())
+ {
+ /*
+ This must be one of the following cases:
+
+ xor reg, reg to assign NULL
+
+ and r1 , r2 if (ptr1 && ptr2) ...
+ or r1 , r2 if (ptr1 || ptr2) ...
+
+ add r1 , r2 to compute a normal byref
+ sub r1 , r2 to compute a strange byref (VC only)
+
+ */
+ case INS_xor:
+ assert(reg1 == reg2);
+ emitGCregLiveUpd(id->idGCref(), targetReg, dst);
+ break;
+
+ case INS_or:
+ case INS_and:
+ emitGCregDeadUpd(targetReg, dst);
+ break;
+
+ case INS_add:
+ case INS_sub:
+ case INS_sub_hide:
+ assert(id->idGCref() == GCT_BYREF);
+
+#if 0
+#ifdef DEBUG
+ // Due to elided register moves, we can't have the following assert.
+ // For example, consider:
+ // t85 = LCL_VAR byref V01 arg1 rdx (last use) REG rdx
+ // /--* t85 byref
+ // * STORE_LCL_VAR byref V40 tmp31 rdx REG rdx
+ // Here, V01 is type `long` on entry, then is stored as a byref. But because
+ // the register allocator assigned the same register, no instruction was
+ // generated, and we only (currently) make gcref/byref changes in emitter GC info
+ // when an instruction is generated. We still generate correct GC info, as this
+ // instruction, if writing a GC ref even through reading a long, will go live here.
+ // These situations typically occur due to unsafe casting, such as with Span.
+
+ regMaskTP regMask;
+ regMask = genRegMask(reg1) | genRegMask(reg2);
+
+ // r1/r2 could have been a GCREF as GCREF + int=BYREF
+ // or BYREF+/-int=BYREF
+ assert(((regMask & emitThisGCrefRegs) && (ins == INS_add)) ||
+ ((regMask & emitThisByrefRegs) && (ins == INS_add || ins == INS_sub || ins == INS_sub_hide)));
+#endif // DEBUG
+#endif // 0
+
+ // Mark r1 as holding a byref
+ emitGCregLiveUpd(GCT_BYREF, targetReg, dst);
+ break;
+
+ default:
+#ifdef DEBUG
+ emitDispIns(id, false, false, false);
+#endif
+ assert(!"unexpected GC reg update instruction");
+ }
+
+ break;
+ }
+
+ case IF_RRW_RRW:
+ {
+ // This must be "xchg reg1, reg2"
+ assert(id->idIns() == INS_xchg);
+
+ // If we got here, the GC-ness of the registers doesn't match, so we have to "swap" them in the GC
+ // register pointer mask.
+
+ GCtype gc1, gc2;
+
+ gc1 = emitRegGCtype(reg1);
+ gc2 = emitRegGCtype(reg2);
+
+ if (gc1 != gc2)
+ {
+ // Kill the GC-info about the GC registers
+
+ if (needsGC(gc1))
+ {
+ emitGCregDeadUpd(reg1, dst);
+ }
+
+ if (needsGC(gc2))
+ {
+ emitGCregDeadUpd(reg2, dst);
+ }
+
+ // Now, swap the info
+
+ if (needsGC(gc1))
+ {
+ emitGCregLiveUpd(gc1, reg2, dst);
+ }
+
+ if (needsGC(gc2))
+ {
+ emitGCregLiveUpd(gc2, reg1, dst);
+ }
+ }
+ break;
+ }
+
+ default:
+#ifdef DEBUG
+ emitDispIns(id, false, false, false);
+#endif
+ assert(!"unexpected GC ref instruction format");
+ }
+}
+
//------------------------------------------------------------------------
// encodeRegAsIval: Encodes a register as an ival for use by a SIMD instruction
//
@@ -1343,9 +1646,23 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const
return true;
}
+ if (id->idIsEvexNfContextSet() && IsBMIInstruction(ins))
+ {
+ // Only a few BMI instructions shall be promoted to APX-EVEX due to the NF feature.
+ // TODO-XArch-APX: convert the check into forms like Has* as above.
+ return true;
+ }
+
#if defined(DEBUG)
if (emitComp->DoJitStressEvexEncoding())
{
+ if (IsBMIInstruction(ins))
+ {
+ // The Encoding_EVEX flag on some BMI instructions is set only because of APX,
+ // so they cannot be stressed with JitStressEvexEncoding.
+ return false;
+ }
+
// Requires the EVEX encoding due to STRESS mode and no change in semantics
//
// Some instructions, like VCMPEQW return the value in a SIMD register for
@@ -1354,6 +1671,12 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const
// check above so we need to still return false here to preserve semantics.
return !HasKMaskRegisterDest(ins);
}
+
+ if (IsApxExtendedEvexInstruction(ins) && emitComp->DoJitStressPromotedEvexEncoding())
+ {
+ // This path will be hit when we stress APX-EVEX and encode VEX with Extended EVEX.
+ return (IsBMIInstruction(ins) && HasApxNf(ins));
+ }
#endif // DEBUG
if ((ins == INS_pslldq) || (ins == INS_psrldq))
@@ -1408,6 +1731,57 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const
return false;
}
+//------------------------------------------------------------------------
+// TakesApxExtendedEvexPrefix: Checks if the instruction should be legacy-promoted-EVEX encoded.
+//
+// Arguments:
+// id -- instruction descriptor to check
+//
+// Return Value:
+// true if this instruction requires a legacy-promoted-EVEX prefix.
+//
+bool emitter::TakesApxExtendedEvexPrefix(const instrDesc* id) const
+{
+ // TODO-XArch-APX:
+    // Isolate the legacy-promoted-EVEX case from the VEX/EVEX-promoted-EVEX cases,
+    // as the latter are relatively simple, only providing EGPR functionality.
+ instruction ins = id->idIns();
+ if (!IsApxExtendedEvexInstruction(ins))
+ {
+ return false;
+ }
+
+ if (IsSimdInstruction(ins))
+ {
+ // This check should reject any instruction not from legacy map-0 or 1.
+ return false;
+ }
+
+ if (id->idIsNoApxEvexPromotion())
+ {
+ return false;
+ }
+
+ if (id->idIsEvexNdContextSet())
+ {
+ return true;
+ }
+
+ if (id->idIsEvexNfContextSet())
+ {
+ return true;
+ }
+
+#if defined(DEBUG)
+ if (emitComp->DoJitStressPromotedEvexEncoding())
+ {
+ return true;
+ }
+#endif // DEBUG
+
+ return false;
+}
+
// Intel AVX-512 encoding is defined in "Intel 64 and ia-32 architectures software developer's manual volume 2", Section
// 2.6.
// Add base EVEX prefix without setting W, R, X, or B bits
@@ -1450,6 +1824,10 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const
#define ZBIT_IN_BYTE_EVEX_PREFIX 0x0000008000000000ULL
#define uBIT_IN_BYTE_EVEX_PREFIX 0x0000040000000000ULL
+#define MAP4_IN_BYTE_EVEX_PREFIX 0x4000000000000ULL
+#define ND_BIT_IN_BYTE_EVEX_PREFIX 0x1000000000ULL
+#define NF_BIT_IN_BYTE_EVEX_PREFIX 0x400000000ULL
+#define EXTENDED_EVEX_PP_BITS 0x10000000000ULL
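+// A rough map of where these bits land, assuming the EVEX prefix occupies the
+// upper four bytes of code_t as [0x62][P0][P1][P2] (see emitExtractEvexPrefix):
+//   MAP4_IN_BYTE_EVEX_PREFIX   - P0 mmm bits, selects extended legacy map 4
+//   ND_BIT_IN_BYTE_EVEX_PREFIX - P2 bit 4 (shared with EVEX.b), new data destination
+//   NF_BIT_IN_BYTE_EVEX_PREFIX - P2 bit 2, "no flags" suppression
+//   EXTENDED_EVEX_PP_BITS      - P1 pp bits set to 01, the embedded 0x66 prefix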
//------------------------------------------------------------------------
// AddEvexPrefix: Add default EVEX prefix with only LL' bits set.
//
@@ -1459,19 +1837,72 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const
// attr -- operand size
//
// Return Value:
-// encoded code with Evex prefix.
+// encoded code with EVEX prefix.
//
emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAttr attr)
{
// Only AVX512 instructions require EVEX prefix
- assert(IsEvexEncodableInstruction(id->idIns()));
+ // After APX, some instructions in legacy or VEX space will be promoted to EVEX.
+ instruction ins = id->idIns();
+ assert(IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins));
+
+ if (instrIsExtendedReg3opImul(ins))
+ {
+        // The only case where imul (0x68) needs an EVEX prefix is when the EVEX.NF
+        // feature is enabled. The imul (0x68) opcode comes with a ModR/M.REG byte to
+        // indicate implicit register use; when it uses extended registers (>= REG_R8),
+        // it comes with a built-in REX prefix. Remove that first and add the
+        // counterpart bits in EVEX.
+ code &= 0xFFFFFFFF;
+ }
// Shouldn't have already added EVEX prefix
assert(!hasEvexPrefix(code));
- assert((code & DEFAULT_BYTE_EVEX_PREFIX_MASK) == 0);
+ assert((code & DEFAULT_BYTE_EVEX_PREFIX_MASK) == 0);
+
+ code |= DEFAULT_BYTE_EVEX_PREFIX;
+
+ if (IsApxExtendedEvexInstruction(ins))
+ {
+ if (!HasEvexEncoding(ins))
+ {
+            // Legacy-promoted instructions are not labeled with Encoding_EVEX.
+ code |= MAP4_IN_BYTE_EVEX_PREFIX;
+ }
+
+ // TODO-XArch-APX:
+ // verify if it is actually safe to reuse the EVEX.ND with EVEX.B on instrDesc.
+ if (id->idIsEvexNdContextSet())
+ {
+ code |= ND_BIT_IN_BYTE_EVEX_PREFIX;
+ }
+
+ if (id->idIsEvexNfContextSet())
+ {
+ code |= NF_BIT_IN_BYTE_EVEX_PREFIX;
+ }
+
+ if (attr == EA_2BYTE)
+ {
+ code |= EXTENDED_EVEX_PP_BITS;
+ }
+
+ if (instrIsExtendedReg3opImul(ins))
+ {
+ // EVEX.R3
+ // TODO-XArch-APX:
+            // A few side notes: based on how the JIT defines IMUL, we may need to extend
+            // the definition to `IMUL_31` to cover EGPRs. It could be defined in a
+            // similar way, with opcodes that come with a built-in REX2 prefix, and
+            // converted to EVEX when needed with some helper functions.
+ code &= 0xFF7FFFFFFFFFFFFFULL;
+ }
- code |= DEFAULT_BYTE_EVEX_PREFIX;
+ return code;
+ }
+
+    // No APX-promoted EVEX instructions should reach the code below.
+ assert(!IsApxExtendedEvexInstruction(ins));
if (attr == EA_32BYTE)
{
@@ -2022,6 +2453,14 @@ emitter::code_t emitter::AddRexWPrefix(const instrDesc* id, code_t code)
}
}
#ifdef TARGET_AMD64
+ else if (TakesApxExtendedEvexPrefix(id))
+ {
+ // If the instruction is not VEX/EVEX encodable, and has EVEX prefix,
+ // then it is legacy promoted EVEX.
+ assert(hasEvexPrefix(code));
+ assert(IsApxExtendedEvexInstruction(ins));
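+        // Set EVEX.W (payload byte P1, bit 7) to select the 64-bit operand size.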
+ return emitter::code_t(code | 0x0000800000000000ULL);
+ }
else if (hasRex2Prefix(code))
{
return emitter::code_t(code | 0x000800000000ULL);
@@ -2060,13 +2499,18 @@ emitter::code_t emitter::AddRexRPrefix(const instrDesc* id, code_t code)
return code & 0xFF7FFFFFFFFFFFULL;
}
}
-#ifdef TARGET_AMD64
+ else if (TakesApxExtendedEvexPrefix(id))
+ {
+ assert(hasEvexPrefix(code));
+ assert(IsApxExtendedEvexInstruction(ins));
+ // R-bit is added in bit-inverted form.
+ return code & 0xFF7FFFFFFFFFFFFFULL;
+ }
else if (TakesRex2Prefix(id))
{
assert(IsRex2EncodableInstruction(ins));
return code |= 0xD50400000000ULL; // REX2.R3
}
-#endif // TARGET_AMD64
return code | 0x4400000000ULL;
}
@@ -2096,13 +2540,18 @@ emitter::code_t emitter::AddRexXPrefix(const instrDesc* id, code_t code)
return code & 0xFFBFFFFFFFFFFFULL;
}
}
-#ifdef TARGET_AMD64
+ else if (TakesApxExtendedEvexPrefix(id))
+ {
+ assert(hasEvexPrefix(code));
+ assert(IsApxExtendedEvexInstruction(ins));
+ // X-bit is added in bit-inverted form.
+ return code & 0xFFBFFFFFFFFFFFFFULL;
+ }
else if (TakesRex2Prefix(id))
{
assert(IsRex2EncodableInstruction(ins));
return code |= 0xD50200000000ULL; // REX2.X3
}
-#endif // TARGET_AMD64
return code | 0x4200000000ULL;
}
@@ -2132,13 +2581,17 @@ emitter::code_t emitter::AddRexBPrefix(const instrDesc* id, code_t code)
return code & 0xFFDFFFFFFFFFFFULL;
}
}
-#ifdef TARGET_AMD64
+ else if (TakesApxExtendedEvexPrefix(id))
+ {
+ assert(IsApxExtendedEvexInstruction(ins));
+        // B-bit is added in bit-inverted form.
+ return code & 0xFFDFFFFFFFFFFFFFULL;
+ }
else if (TakesRex2Prefix(id))
{
assert(IsRex2EncodableInstruction(ins));
return code |= 0xD50100000000ULL; // REX2.B3
}
-#endif // TARGET_AMD64
return code | 0x4100000000ULL;
}
@@ -2221,7 +2674,7 @@ bool isPrefix(BYTE b)
//
emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) const
{
- assert(IsEvexEncodableInstruction(ins));
+ assert(IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins));
code_t evexPrefix = (code >> 32) & 0xFFFFFFFF;
code &= 0x00000000FFFFFFFFLL;
@@ -2253,6 +2706,14 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co
case 0x66:
{
// None of the existing BMI instructions should be EVEX encoded.
+            // After APX, BMI instructions can be EVEX encoded with the NF feature.
+ if (IsBMIInstruction(ins))
+ {
+                // If a BMI instruction reaches this point, it must be APX-EVEX encoded.
+                // Although the opcode of every BMI instruction is defined with 0x66,
+                // that byte must not be encoded as the EVEX.pp bits, so skip this check.
+ break;
+ }
assert(!IsBMIInstruction(ins));
evexPrefix |= (0x01 << 8);
break;
@@ -2306,9 +2767,14 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co
// 2-byte opcode with the bytes ordered as 0x0011RM22. There are 2 possibilities here:
// 1. the byte in position 11 must be an escape byte.
// 2. the byte in position 11 must be a map number from 0 to 7.
+
+    // APX promoted EVEX instructions might also take this path, so the opcode can also be 1-byte, in the form of
+    // 0x0000RM11.
leadingBytes = (code >> 16) & 0xFF;
- assert(leadingBytes == 0x0F || (emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) &&
- leadingBytes >= 0x00 && leadingBytes <= 0x07));
+ assert(leadingBytes == 0x0F ||
+ (emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) && leadingBytes >= 0x00 &&
+ leadingBytes <= 0x07) ||
+ (IsApxExtendedEvexInstruction(ins) && leadingBytes == 0));
code &= 0xFFFF;
}
@@ -2330,6 +2796,12 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co
case 0x0F:
{
+ if (((evexPrefix >> 16) & 0x07) == 0x04)
+ {
+            // A MAP index equal to 4 indicates this instruction is a promoted legacy instruction.
+            // The MAP ID was already set when the EVEX prefix was added.
+ break;
+ }
evexPrefix |= (0x01 << 16);
break;
}
@@ -2803,6 +3275,11 @@ unsigned emitter::emitGetRexPrefixSize(instrDesc* id, instruction ins)
return 0;
}
+ if (TakesApxExtendedEvexPrefix(id))
+ {
+ return 0;
+ }
+
if (TakesRex2Prefix(id))
{
return 0;
@@ -2913,10 +3390,20 @@ unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const
adjustedSize++;
}
#ifdef TARGET_AMD64
- else if (IsRex2EncodableInstruction(ins))
+ else if (IsRex2EncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins))
{
unsigned prefixAdjustedSize = 0;
- if (TakesRex2Prefix(id))
+ if (TakesApxExtendedEvexPrefix(id))
+ {
+ prefixAdjustedSize = 4;
+ // If the opcode will be prefixed by EVEX, then all the map-1-legacy instructions can remove the escape
+ // prefix
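+            // e.g. a map-1 opcode such as movzx (0F B6) drops its 0F escape byte
+            // under promoted EVEX, so the net prefix cost is +3 bytes, not +4.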
+ if (IsLegacyMap1(code))
+ {
+ prefixAdjustedSize -= 1;
+ }
+ }
+ else if (TakesRex2Prefix(id))
{
prefixAdjustedSize = 2;
// If the opcode will be prefixed by REX2, then all the map-1-legacy instructions can remove the escape
@@ -2927,15 +3414,14 @@ unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const
}
}
- adjustedSize = prefixAdjustedSize;
-
emitAttr attr = id->idOpSize();
-
- if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx))
+ if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx) && !TakesApxExtendedEvexPrefix(id))
{
// Most 16-bit operand instructions will need a 0x66 prefix.
- adjustedSize++;
+ prefixAdjustedSize++;
}
+
+ adjustedSize = prefixAdjustedSize;
}
#endif // TARGET_AMD64
else
@@ -2987,6 +3473,14 @@ unsigned emitter::emitGetPrefixSize(instrDesc* id, code_t code, bool includeRexP
if (includeRexPrefixSize && hasRexPrefix(code))
{
+ if (instrIsExtendedReg3opImul(id->idIns()) && TakesApxExtendedEvexPrefix(id))
+ {
+        // There is a special case when calculating the size of IMUL with APX-EVEX:
+        // IMUL_08 and beyond have a built-in REX prefix in their opcode, so they hit
+        // this branch; but when IMUL is encoded with APX-EVEX, the size of REX is
+        // included in the prefix size, which is calculated elsewhere.
+ return 0;
+ }
return 1;
}
@@ -3628,7 +4122,7 @@ inline unsigned emitter::insEncodeReg012(const instrDesc* id, regNumber reg, emi
{
// We are assuming that we only use/encode SPL, BPL, SIL and DIL
// not the corresponding AH, CH, DH, or BH
- *code = hasRex2Prefix(*code) ? *code : AddRexPrefix(ins, *code); // REX
+ *code = (hasRex2Prefix(*code) || hasEvexPrefix(*code)) ? *code : AddRexPrefix(ins, *code); // REX
}
#endif // TARGET_AMD64
@@ -3668,7 +4162,7 @@ inline unsigned emitter::insEncodeReg345(const instrDesc* id, regNumber reg, emi
}
if (false /*reg >= REG_R16 && reg <= REG_R31*/)
{
- // seperate the encoding for REX2.R3/R4, REX2.R3 will be handled in `AddRexRPrefix`.
+    // Separate the encoding for REX2.R3/R4, REX2.R3 will be handled in `AddRexRPrefix`.
assert(TakesRex2Prefix(id));
*code |= 0x004000000000ULL; // REX2.R4
}
@@ -3677,7 +4171,7 @@ inline unsigned emitter::insEncodeReg345(const instrDesc* id, regNumber reg, emi
{
// We are assuming that we only use/encode SPL, BPL, SIL and DIL
// not the corresponding AH, CH, DH, or BH
- *code = hasRex2Prefix(*code) ? *code : AddRexPrefix(ins, *code); // REX
+ *code = (hasRex2Prefix(*code) || hasEvexPrefix(*code)) ? *code : AddRexPrefix(ins, *code); // REX
}
#endif // TARGET_AMD64
@@ -3697,7 +4191,7 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber
instruction ins = id->idIns();
assert(reg < REG_STK);
- assert(IsVexOrEvexEncodableInstruction(ins));
+ assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins));
assert(hasVexOrEvexPrefix(code));
// Get 4-bit register encoding
@@ -3744,6 +4238,25 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber
return code ^ regBits;
}
}
+ else
+ {
+ assert(TakesApxExtendedEvexPrefix(id));
+ assert(hasEvexPrefix(code));
+#if defined(TARGET_AMD64)
+ // TODO-XARCH-AVX512 I don't like that we redefine regBits on the EVEX case.
+ // Rather see these paths cleaned up.
+ regBits = HighAwareRegEncoding(reg);
+
+ if (false /*reg >= REG_R16 && reg <= REG_R31*/)
+ {
+ // Have to set the EVEX V' bit
+ code = AddEvexVPrimePrefix(code);
+ }
+#endif
+ // Shift count = 5-bytes of opcode + 0-2 bits for EVEX
+ regBits <<= 43;
+ return code ^ regBits;
+ }
return code ^ regBits;
}
@@ -3779,7 +4292,7 @@ inline unsigned emitter::insEncodeRegSIB(const instrDesc* id, regNumber reg, cod
}
if (false /*reg >= REG_R16 && reg <= REG_R31*/)
{
- // seperate the encoding for REX2.X3/X4, REX2.X3 will be handled in `AddRexXPrefix`.
+ // Separate the encoding for REX2.X3/X4, REX2.X3 will be handled in `AddRexXPrefix`.
assert(TakesRex2Prefix(id));
*code |= 0x002000000000ULL; // REX2.X4
}
@@ -4175,7 +4688,8 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id)
if ((code & 0xFF00) != 0)
{
- sz += IsSimdInstruction(ins) ? emitInsSize(id, code, includeRexPrefixSize) : 5;
+ sz += (IsSimdInstruction(ins) || TakesApxExtendedEvexPrefix(id)) ? emitInsSize(id, code, includeRexPrefixSize)
+ : 5;
}
else
{
@@ -4303,7 +4817,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code,
assert(emitComp->lvaTempsHaveLargerOffsetThanVars());
// Check whether we can use compressed displacement if EVEX.
- if (TakesEvexPrefix(id))
+ if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id))
{
bool compressedFitsInByte = false;
TryEvexCompressDisp8Byte(id, ssize_t(offs), &compressedFitsInByte);
@@ -4347,7 +4861,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code,
#endif // !FEATURE_FIXED_OUT_ARGS
bool useSmallEncoding = false;
- if (TakesEvexPrefix(id))
+ if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id))
{
TryEvexCompressDisp8Byte(id, ssize_t(offs), &useSmallEncoding);
}
@@ -4514,7 +5028,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code)
}
else
{
- if (TakesEvexPrefix(id))
+ if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id))
{
dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte);
}
@@ -5459,17 +5973,37 @@ void emitter::emitInsStoreLcl(instruction ins, emitAttr attr, GenTreeLclVarCommo
// attr - the instruction operand size
// dst - the destination and first source operand
// src - the second source operand
+// targetReg - target register of this binary node (only used for APX-NDD form)
//
// Assumptions:
// i) caller of this routine needs to call genConsumeReg()
// ii) caller of this routine needs to call genProduceReg()
-regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src)
+regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src, regNumber targetReg)
{
// We can only have one memory operand and only src can be a constant operand
// However, the handling for a given operand type (mem, cns, or other) is fairly
// consistent regardless of whether they are src or dst. As such, we will find
// the type of each operand and only check them against src/dst where relevant.
+ const bool useNDD = UsePromotedEVEXEncoding() && (targetReg != REG_NA);
+#if !defined(TARGET_AMD64)
+    // APX is not supported on 32-bit targets.
+ assert(!useNDD);
+#else
+ if (useNDD)
+ {
+ assert(IsApxNDDEncodableInstruction(ins));
+ // targetReg has to be an actual register if using NDD.
+ assert(targetReg < REG_STK);
+        // Make sure the target register is not one of the source registers.
+ assert(dst->isUsedFromReg());
+ regNumber dstreg = dst->GetRegNum();
+ regNumber srcreg = src->isUsedFromReg() ? src->GetRegNum() : REG_NA;
+ assert(targetReg != dstreg);
+ assert(targetReg != srcreg);
+ }
+#endif
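+    // A minimal sketch of the NDD form assumed here: the destination is a third,
+    // distinct register, e.g.
+    //     add targetReg, dstOp, srcOp    ; targetReg = dstOp + srcOp
+    // versus the legacy two-operand read-modify-write sequence
+    //     mov targetReg, dstOp
+    //     add targetReg, srcOp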
+
GenTree* memOp = nullptr;
GenTree* cnsOp = nullptr;
GenTree* otherOp = nullptr;
@@ -5481,6 +6015,9 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G
assert(dst->isUsedFromMemory() || (dst->GetRegNum() == REG_NA) || instrIs3opImul(ins));
assert(!src->isUsedFromMemory());
+ // APX code cannot hit this path.
+ assert(!useNDD);
+
memOp = dst;
if (src->isContained())
@@ -5588,6 +6125,9 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G
assert(otherOp == nullptr);
assert(src->IsCnsIntOrI());
+ // APX code cannot hit this path.
+ assert(!useNDD);
+
id = emitNewInstrAmdCns(attr, memIndir->Offset(), (int)src->AsIntConCommon()->IconValue());
}
else
@@ -5605,6 +6145,13 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G
assert(id != nullptr);
id->idIns(ins); // Set the instruction.
+ if (useNDD)
+ {
+ assert(memOp == src);
+ id->idReg1(targetReg);
+ id->idReg2(dst->GetRegNum());
+ id->idSetEvexNdContext();
+ }
// Determine the instruction format
insFormat fmt = IF_NONE;
@@ -5620,12 +6167,13 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G
}
else
{
- fmt = emitInsModeFormat(ins, IF_RRD_ARD);
+ fmt = useNDD ? emitInsModeFormat(ins, IF_RWR_RRD_ARD) : emitInsModeFormat(ins, IF_RRD_ARD);
}
}
else
{
assert(memOp == dst);
+ assert(!useNDD);
if (cnsOp != nullptr)
{
@@ -5664,6 +6212,7 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G
else
{
assert(memOp == dst);
+ assert(!useNDD);
if (cnsOp != nullptr)
{
@@ -5686,7 +6235,7 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G
dispIns(id);
emitCurIGsize += sz;
- return (memOp == src) ? dst->GetRegNum() : REG_NA;
+ return (memOp == src) ? (useNDD ? targetReg : dst->GetRegNum()) : REG_NA;
}
}
}
@@ -5734,15 +6283,24 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G
}
else
{
- // src is a stack based local variable
- // dst is a register
- emitIns_R_S(ins, attr, dst->GetRegNum(), varNum, offset);
+ if (useNDD)
+ {
+ emitIns_R_R_S(ins, attr, targetReg, dst->GetRegNum(), varNum, offset, INS_OPTS_EVEX_nd);
+ return targetReg;
+ }
+ else
+ {
+ // src is a stack based local variable
+ // dst is a register
+ emitIns_R_S(ins, attr, dst->GetRegNum(), varNum, offset);
+ }
}
}
else
{
assert(memOp == dst);
assert((dst->GetRegNum() == REG_NA) || dst->IsRegOptional());
+ assert(!useNDD);
if (cnsOp != nullptr)
{
@@ -5774,10 +6332,20 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G
{
assert(!dst->isContained());
GenTreeIntConCommon* intCns = src->AsIntConCommon();
- emitIns_R_I(ins, attr, dst->GetRegNum(), intCns->IconValue());
+
+ if (useNDD)
+ {
+ emitIns_R_R_I(ins, attr, targetReg, dst->GetRegNum(), (int)intCns->IconValue(), INS_OPTS_EVEX_nd);
+ return targetReg;
+ }
+ else
+ {
+ emitIns_R_I(ins, attr, dst->GetRegNum(), intCns->IconValue());
+ }
}
else
{
+ assert(!useNDD);
assert(src->IsCnsFltOrDbl());
GenTreeDblCon* dblCns = src->AsDblCon();
@@ -5796,7 +6364,15 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G
}
else
{
- emitIns_R_R(ins, attr, dst->GetRegNum(), src->GetRegNum());
+ if (useNDD)
+ {
+ emitIns_R_R_R(ins, attr, targetReg, dst->GetRegNum(), src->GetRegNum(), INS_OPTS_EVEX_nd);
+ return targetReg;
+ }
+ else
+ {
+ emitIns_R_R(ins, attr, dst->GetRegNum(), src->GetRegNum());
+ }
}
}
@@ -5947,7 +6523,7 @@ void emitter::emitInsRMW(instruction ins, emitAttr attr, GenTreeStoreInd* storeI
* Add an instruction referencing a single register.
*/
-void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg)
+void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg, insOpts instOptions /* = INS_OPTS_NONE */)
{
emitAttr size = EA_SIZE(attr);
@@ -6023,6 +6599,8 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg)
id->idInsFmt(fmt);
id->idReg1(reg);
+ SetEvexNfIfNeeded(id, instOptions);
+
// Vex bytes
sz += emitGetAdjustedSize(id, insEncodeMRreg(id, reg, attr, insCodeMR(ins)));
@@ -6095,10 +6673,11 @@ void emitter::emitStoreSimd12ToLclOffset(unsigned varNum, unsigned offset, regNu
* Add an instruction referencing a register and a constant.
*/
-void emitter::emitIns_R_I(instruction ins,
- emitAttr attr,
- regNumber reg,
- ssize_t val DEBUGARG(size_t targetHandle) DEBUGARG(GenTreeFlags gtFlags))
+void emitter::emitIns_R_I(instruction ins,
+ emitAttr attr,
+ regNumber reg,
+ ssize_t val,
+ insOpts instOptions DEBUGARG(size_t targetHandle) DEBUGARG(GenTreeFlags gtFlags))
{
emitAttr size = EA_SIZE(attr);
@@ -6238,6 +6817,8 @@ void emitter::emitIns_R_I(instruction ins,
id->idDebugOnlyInfo()->idMemCookie = targetHandle;
#endif
+ SetEvexNfIfNeeded(id, instOptions);
+
if (isSimdInsAndValInByte)
{
bool includeRexPrefixSize = true;
@@ -6251,8 +6832,14 @@ void emitter::emitIns_R_I(instruction ins,
sz += emitInsSize(id, insCodeMI(ins), includeRexPrefixSize);
}
-
sz += emitGetAdjustedSize(id, insCodeMI(ins));
+#ifdef TARGET_AMD64
+ if (reg == REG_EAX && !instrIs3opImul(ins) && TakesApxExtendedEvexPrefix(id))
+ {
+ // ACC form is not promoted into EVEX space, need to emit with MI form.
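+        // e.g. "add eax, imm32" would otherwise use the short accumulator form
+        // (opcode 0x05, no ModRM); the MI form (0x81 /0) adds a ModRM byte.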
+ sz += 1;
+ }
+#endif // TARGET_AMD64
// Do we need a REX prefix for AMD64? We need one if we are using any extended register (REX.R), or if we have a
// 64-bit sized operand (REX.W). Note that IMUL in our encoding is special, with a "built-in", implicit, target
@@ -7026,6 +7613,14 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum
id->idReg1(reg1);
id->idReg2(reg2);
+ SetEvexNdIfNeeded(id, instOptions);
+ SetEvexNfIfNeeded(id, instOptions);
+
+ if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins))
+ {
+ id->idInsFmt(IF_RWR_RRD);
+ }
+
if ((instOptions & INS_OPTS_EVEX_b_MASK) != INS_OPTS_NONE)
{
// if EVEX.b needs to be set in this path, then it should be embedded rounding.
@@ -7079,6 +7674,32 @@ void emitter::emitIns_R_R_I(
assert((instOptions & INS_OPTS_EVEX_b_MASK) == 0);
SetEvexEmbMaskIfNeeded(id, instOptions);
+ SetEvexNdIfNeeded(id, instOptions);
+
+ if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins))
+ {
+        // Need to fix the instruction opcode for legacy instructions, as they have a different opcode for the RI form.
+ code = insCodeMI(ins);
+        // Need to fix the instruction format for NDD legacy instructions.
+ insFormat fmt;
+ switch (ins)
+ {
+ case INS_shl_N:
+ case INS_shr_N:
+ case INS_sar_N:
+ case INS_ror_N:
+ case INS_rol_N:
+ case INS_rcr_N:
+ case INS_rcl_N:
+ fmt = IF_RWR_RRD_SHF;
+ break;
+
+ default:
+ fmt = IF_RWR_RRD_CNS;
+ break;
+ }
+ id->idInsFmt(fmt);
+ }
UNATIVE_OFFSET sz = emitInsSizeRR(id, code, ival);
id->idCodeSize(sz);
@@ -7087,7 +7708,7 @@ void emitter::emitIns_R_R_I(
emitCurIGsize += sz;
}
-void emitter::emitIns_AR(instruction ins, emitAttr attr, regNumber base, int offs)
+void emitter::emitIns_AR(instruction ins, emitAttr attr, regNumber base, int offs, insOpts instOptions)
{
assert(ins == INS_prefetcht0 || ins == INS_prefetcht1 || ins == INS_prefetcht2 || ins == INS_prefetchnta ||
ins == INS_inc || ins == INS_dec);
@@ -7100,6 +7721,11 @@ void emitter::emitIns_AR(instruction ins, emitAttr attr, regNumber base, int off
id->idAddr()->iiaAddrMode.amBaseReg = base;
id->idAddr()->iiaAddrMode.amIndxReg = REG_NA;
+ if ((instOptions & INS_OPTS_EVEX_NoApxPromotion) != 0)
+ {
+ id->idSetNoApxEvexPromotion();
+ }
+
UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeMR(ins));
id->idCodeSize(sz);
@@ -7443,8 +8069,8 @@ void emitter::emitIns_R_R_C(instruction ins,
void emitter::emitIns_R_R_R(
instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, insOpts instOptions)
{
- assert(IsSimdInstruction(ins));
- assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins));
+ assert(IsSimdInstruction(ins) || IsApxExtendedEvexInstruction(ins));
+ assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins) || IsApxExtendedEvexInstruction(ins));
instrDesc* id = emitNewInstr(attr);
id->idIns(ins);
@@ -7460,6 +8086,14 @@ void emitter::emitIns_R_R_R(
id->idSetEvexbContext(instOptions);
}
SetEvexEmbMaskIfNeeded(id, instOptions);
+ SetEvexNdIfNeeded(id, instOptions);
+ SetEvexNfIfNeeded(id, instOptions);
+
+ if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins))
+ {
+        // Need to fix the instruction format for NDD legacy instructions.
+ id->idInsFmt(IF_RWR_RRD_RRD);
+ }
UNATIVE_OFFSET sz = emitInsSizeRR(id, insCodeRM(ins));
id->idCodeSize(sz);
@@ -7471,8 +8105,8 @@ void emitter::emitIns_R_R_R(
void emitter::emitIns_R_R_S(
instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, insOpts instOptions)
{
- assert(IsSimdInstruction(ins));
- assert(IsThreeOperandAVXInstruction(ins));
+ assert(IsSimdInstruction(ins) || IsApxExtendedEvexInstruction(ins));
+ assert(IsThreeOperandAVXInstruction(ins) || IsApxExtendedEvexInstruction(ins));
instrDesc* id = emitNewInstr(attr);
@@ -7484,6 +8118,12 @@ void emitter::emitIns_R_R_S(
SetEvexBroadcastIfNeeded(id, instOptions);
SetEvexEmbMaskIfNeeded(id, instOptions);
+ SetEvexNdIfNeeded(id, instOptions);
+
+ if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins))
+ {
+ id->idInsFmt(IF_RWR_RRD_SRD);
+ }
#ifdef DEBUG
id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs;
@@ -8156,7 +8796,7 @@ void emitter::emitIns_R_L(instruction ins, emitAttr attr, BasicBlock* dst, regNu
* The following adds instructions referencing address modes.
*/
-void emitter::emitIns_I_AR(instruction ins, emitAttr attr, int val, regNumber reg, int disp)
+void emitter::emitIns_I_AR(instruction ins, emitAttr attr, int val, regNumber reg, int disp, insOpts instOptions)
{
assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE));
@@ -8202,6 +8842,10 @@ void emitter::emitIns_I_AR(instruction ins, emitAttr attr, int val, regNumber re
id->idAddr()->iiaAddrMode.amBaseReg = reg;
id->idAddr()->iiaAddrMode.amIndxReg = REG_NA;
+ if ((instOptions & INS_OPTS_EVEX_NoApxPromotion) != 0)
+ {
+ id->idSetNoApxEvexPromotion();
+ }
assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly
@@ -8310,9 +8954,10 @@ void emitter::emitIns_R_AI(instruction ins,
emitCurIGsize += sz;
}
-void emitter::emitIns_AR_R(instruction ins, emitAttr attr, regNumber reg, regNumber base, cnsval_ssize_t disp)
+void emitter::emitIns_AR_R(
+ instruction ins, emitAttr attr, regNumber reg, regNumber base, cnsval_ssize_t disp, insOpts instOptions)
{
- emitIns_ARX_R(ins, attr, reg, base, REG_NA, 1, disp);
+ emitIns_ARX_R(ins, attr, reg, base, REG_NA, 1, disp, instOptions);
}
//------------------------------------------------------------------------
@@ -8595,8 +9240,14 @@ void emitter::emitIns_R_ARX(
emitCurIGsize += sz;
}
-void emitter::emitIns_ARX_R(
- instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, unsigned scale, cnsval_ssize_t disp)
+void emitter::emitIns_ARX_R(instruction ins,
+ emitAttr attr,
+ regNumber reg,
+ regNumber base,
+ regNumber index,
+ unsigned scale,
+ cnsval_ssize_t disp,
+ insOpts instOptions)
{
UNATIVE_OFFSET sz;
instrDesc* id = emitNewInstrAmd(attr, disp);
@@ -8622,6 +9273,10 @@ void emitter::emitIns_ARX_R(
id->idAddr()->iiaAddrMode.amBaseReg = base;
id->idAddr()->iiaAddrMode.amIndxReg = index;
id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(scale);
+ if ((instOptions & INS_OPTS_EVEX_NoApxPromotion) != 0)
+ {
+ id->idSetNoApxEvexPromotion();
+ }
assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly
@@ -9606,6 +10261,74 @@ void emitter::emitIns_S(instruction ins, emitAttr attr, int varx, int offs)
emitAdjustStackDepthPushPop(ins);
}
+void emitter::emitIns_BASE_R_R(instruction ins, emitAttr attr, regNumber op1Reg, regNumber op2Reg)
+{
+ if (DoJitUseApxNDD(ins) && (op1Reg != op2Reg))
+ {
+        // If APX-EVEX-NDD is available and needed, emit the operation in NDD form:
+ // ins dst, src
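+        // e.g. assuming ins = INS_neg, this emits "neg op1Reg, op2Reg", writing
+        // the negation of op2Reg into op1Reg with no separate mov.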
+ emitIns_R_R(ins, attr, op1Reg, op2Reg, INS_OPTS_EVEX_nd);
+ }
+ else
+ {
+ // mov dst, src
+ // ins dst
+ emitIns_Mov(INS_mov, attr, op1Reg, op2Reg, /*canSkip*/ true);
+ emitIns_R(ins, attr, op1Reg);
+ }
+}
+
+void emitter::emitIns_BASE_R_R_I(instruction ins, emitAttr attr, regNumber op1Reg, regNumber op2Reg, int ival)
+{
+ if (DoJitUseApxNDD(ins) && (op1Reg != op2Reg))
+ {
+        // If APX-EVEX-NDD is available and needed, emit the operation in NDD form:
+ // ins dst, src, cns
+ if (IsShiftInstruction(ins) && ival == 1)
+ {
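+            // The shift-by-1 form uses the short opcode with an implicit count
+            // (the x86 D0/D1 family), so no immediate operand is emitted.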
+ emitIns_R_R(ins, attr, op1Reg, op2Reg, INS_OPTS_EVEX_nd);
+ }
+ else
+ {
+ emitIns_R_R_I(ins, attr, op1Reg, op2Reg, ival, INS_OPTS_EVEX_nd);
+ }
+ }
+ else
+ {
+ // mov dst, src
+ // ins dst, cns
+ emitIns_Mov(INS_mov, attr, op1Reg, op2Reg, /*canSkip*/ true);
+ if (IsShiftInstruction(ins) && ival == 1)
+ {
+ emitIns_R(ins, attr, op1Reg);
+ }
+ else
+ {
+ emitIns_R_I(ins, attr, op1Reg, ival);
+ }
+ }
+}
+
+regNumber emitter::emitIns_BASE_R_R_RM(
+ instruction ins, emitAttr attr, regNumber targetReg, GenTree* treeNode, GenTree* regOp, GenTree* rmOp)
+{
+ bool requiresOverflowCheck = treeNode->gtOverflowEx();
+ regNumber r = REG_NA;
+ assert(regOp->isUsedFromReg());
+
+ if (DoJitUseApxNDD(ins) && regOp->GetRegNum() != targetReg)
+ {
+ r = emitInsBinary(ins, attr, regOp, rmOp, targetReg);
+ }
+ else
+ {
+ emitIns_Mov(INS_mov, attr, targetReg, regOp->GetRegNum(), /*canSkip*/ true);
+ r = emitInsBinary(ins, attr, treeNode, rmOp);
+ }
+
+ return r;
+}
+
//----------------------------------------------------------------------------------------
// IsRedundantStackMov:
// Check if the current `mov` instruction is redundant and can be omitted when dealing with Load/Store from stack.
@@ -9743,6 +10466,7 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber ireg, int va
SetEvexBroadcastIfNeeded(id, instOptions);
SetEvexEmbMaskIfNeeded(id, instOptions);
+ SetEvexNfIfNeeded(id, instOptions);
UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs);
id->idCodeSize(sz);
@@ -11447,6 +12171,13 @@ void emitter::emitDispEmbRounding(instrDesc* id) const
{
return;
}
+
+ if (IsApxExtendedEvexInstruction(id->idIns()))
+ {
+        // APX-EVEX.nd shares the same bit(s) with EVEX.b;
+        // for the NDD case, we don't need to display anything special.
+ return;
+ }
assert(!id->idHasMem());
unsigned roundingMode = id->idGetEvexbContext();
if (roundingMode == 1)
@@ -11627,6 +12358,14 @@ void emitter::emitDispIns(
/* Display the instruction name */
+#ifdef TARGET_AMD64
+ if (IsApxNFEncodableInstruction(id->idIns()) && id->idIsEvexNfContextSet())
+ {
+        // Print the EVEX.NF indication in pseudo-prefix style.
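+        // e.g. a flag-suppressing add is displayed as "{nf} add ..." rather
+        // than as a distinct mnemonic.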
+ printf("{nf} ");
+ }
+#endif // TARGET_AMD64
+
sstr = codeGen->genInsDisplayName(id);
printf(" %-9s", sstr);
@@ -12377,6 +13116,20 @@ void emitter::emitDispIns(
break;
}
+ case INS_rol:
+ case INS_ror:
+ case INS_rcl:
+ case INS_rcr:
+ case INS_shl:
+ case INS_shr:
+ case INS_sar:
+ {
+ printf("%s", emitRegName(id->idReg1(), attr));
+ printf(", %s", emitRegName(id->idReg2(), attr));
+ emitDispShift(ins, (BYTE)0);
+ break;
+ }
+
default:
{
printf("%s", emitRegName(id->idReg1(), attr));
@@ -12394,8 +13147,8 @@ void emitter::emitDispIns(
case IF_RRW_RRD_RRD:
case IF_RWR_RWR_RRD:
{
- assert(IsVexOrEvexEncodableInstruction(ins));
- assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins));
+ assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins));
+ assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins) || IsApxExtendedEvexInstruction(ins));
regNumber reg2 = id->idReg2();
regNumber reg3 = id->idReg3();
@@ -12630,6 +13383,19 @@ void emitter::emitDispIns(
break;
}
+ case IF_RWR_RRD_SHF:
+ {
+ assert(IsApxExtendedEvexInstruction(id->idIns()));
+ printf("%s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr));
+
+ emitGetInsCns(id, &cnsVal);
+ val = cnsVal.cnsVal;
+
+ emitDispShift(ins, (BYTE)val);
+
+ break;
+ }
+
case IF_RRD_MRD:
case IF_RWR_MRD:
case IF_RRW_MRD:
@@ -13578,12 +14344,21 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
break;
case EA_2BYTE:
-
- /* Output a size prefix for a 16-bit operand */
-
- dst += emitOutputByte(dst, 0x66);
-
+ {
+ // Output a size prefix for a 16-bit operand
+ if (TakesApxExtendedEvexPrefix(id))
+ {
+ assert(IsApxExtendedEvexInstruction(ins));
+ assert(hasEvexPrefix(code));
+ // Evex.pp should already be added when adding the prefix.
+ assert((code & EXTENDED_EVEX_PP_BITS) != 0);
+ }
+ else
+ {
+ dst += emitOutputByte(dst, 0x66);
+ }
FALLTHROUGH;
+ }
case EA_4BYTE:
#ifdef TARGET_AMD64
@@ -13627,7 +14402,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
}
else
{
- if (TakesEvexPrefix(id))
+ if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id))
{
dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte);
}
@@ -14165,6 +14940,14 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst);
break;
+ case IF_RWR_RRD_ARD:
+ assert(((id->idGCref() == GCT_BYREF) &&
+ (ins == INS_add || ins == INS_sub || ins == INS_sub_hide || insIsCMOV(ins))) ||
+ ((id->idGCref() == GCT_GCREF) && insIsCMOV(ins)));
+ assert(id->idIsEvexNdContextSet());
+ emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst);
+ break;
+
case IF_ARD_RRD:
case IF_AWR_RRD:
break;
@@ -14411,25 +15194,45 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
switch (size)
{
case EA_1BYTE:
+#ifdef TARGET_AMD64
+ assert((ins != INS_lzcnt_apx) && (ins != INS_tzcnt_apx) && (ins != INS_popcnt_apx));
+#endif // TARGET_AMD64
break;
case EA_2BYTE:
// Output a size prefix for a 16-bit operand
- dst += emitOutputByte(dst, 0x66);
+ {
+ if (!TakesApxExtendedEvexPrefix(id))
+ {
+ dst += emitOutputByte(dst, 0x66);
+ }
+ }
FALLTHROUGH;
case EA_4BYTE:
+ code |= 0x01;
+ break;
+
#ifdef TARGET_AMD64
case EA_8BYTE:
-#endif // TARGET_AMD64
-
/* Set the 'w' size bit to indicate 32-bit operation
* Note that incrementing "code" for INS_call (0xFF) would
* overflow, whereas setting the lower bit to 1 just works out
*/
-
- code |= 0x01;
- break;
+ {
+ if (TakesApxExtendedEvexPrefix(id))
+ {
+ assert(hasEvexPrefix(code));
+ code = AddRexWPrefix(id, code);
+ }
+            // These instructions do not support 1-byte inputs and the opcode is exact.
+            if ((ins != INS_lzcnt_apx) && (ins != INS_tzcnt_apx) && (ins != INS_popcnt_apx))
+            {
+ code |= 0x01;
+ }
+ break;
+ }
+#endif // TARGET_AMD64
#ifdef TARGET_X86
case EA_8BYTE:
@@ -14463,7 +15266,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
// function, to which the remainder of the emitter logic should handle properly.
// TODO-XARCH-AVX512 : embedded broadcast might change this
int dspAsByte = dsp;
- if (TakesEvexPrefix(id))
+ if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id))
{
dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte));
}
@@ -14517,7 +15320,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
// TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following
// function, to which the remainder of the emitter logic should handle properly.
// TODO-XARCH-AVX512 : embedded broadcast might change this
- if (TakesEvexPrefix(id))
+ if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id))
{
dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte));
}
@@ -14664,6 +15467,15 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst);
break;
+ case IF_RWR_RRD_SRD: // Register Read/Write, Stack Read (So we need to update GC live for register)
+
+ // reg could have been a GCREF as GCREF + int=BYREF
+ // or BYREF+/-int=BYREF
+ assert(id->idGCref() == GCT_BYREF && (ins == INS_add || ins == INS_sub || ins == INS_sub_hide));
+ assert(id->idIsEvexNdContextSet());
+ emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst);
+ break;
+
case IF_SRW_CNS:
case IF_SRW_RRD:
case IF_SRW_RRW:
@@ -15249,7 +16061,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id)
// Can't use the compact form, use the long form
ins = (instruction)(ins + 1);
- if (size == EA_2BYTE)
+ if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id))
{
// Output a size prefix for a 16-bit operand
dst += emitOutputByte(dst, 0x66);
@@ -15262,10 +16074,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id)
code |= 0x1;
}
- if (TakesRex2Prefix(id))
- {
- code = AddRex2Prefix(ins, code);
- }
+ code = AddX86PrefixIfNeeded(id, code, size);
if (TakesRexWPrefix(id))
{
@@ -15400,23 +16209,22 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id)
default:
assert(id->idGCref() == GCT_NONE);
-
- code = insEncodeMRreg(id, reg, size, insCodeMR(ins));
+ code = insCodeMR(ins);
+ code = AddX86PrefixIfNeeded(id, code, size);
+ code = insEncodeMRreg(id, reg, size, code);
if (size != EA_1BYTE)
{
// Set the 'w' bit to get the large version
code |= 0x1;
- if (size == EA_2BYTE)
+ if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id))
{
// Output a size prefix for a 16-bit operand
dst += emitOutputByte(dst, 0x66);
}
}
- code = AddX86PrefixIfNeeded(id, code, size);
-
if (TakesRexWPrefix(id))
{
code = AddRexWPrefix(id, code);
@@ -15553,7 +16361,11 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
}
#ifdef FEATURE_HW_INTRINSICS
else if ((ins == INS_bsf) || (ins == INS_bsr) || (ins == INS_crc32) || (ins == INS_lzcnt) || (ins == INS_popcnt) ||
- (ins == INS_tzcnt))
+ (ins == INS_tzcnt)
+#ifdef TARGET_AMD64
+ || (ins == INS_lzcnt_apx) || (ins == INS_tzcnt_apx) || (ins == INS_popcnt_apx)
+#endif // TARGET_AMD64
+ )
{
assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins));
code = insCodeRM(ins);
@@ -15564,7 +16376,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
code |= 0x0100;
}
- if (size == EA_2BYTE)
+ if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id))
{
assert(ins == INS_crc32);
dst += emitOutputByte(dst, 0x66);
@@ -15577,15 +16389,21 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
#endif // FEATURE_HW_INTRINSICS
else
{
+ // TODO-XArch-APX:
+        // Some instructions with an NDD form might take this path with an EVEX prefix;
+        // consider having a separate path with checks like TakesApxExtendedEvexPrefix.
+        // Essentially, we need to make the priority and necessity of REX2 and EVEX clear:
+        // REX2 is needed iff EGPRs are involved.
+        // EVEX is needed when NDD, NF or other features are involved.
+        // So the logic should be: check whether those new features are used, then check
+        // whether EGPRs are involved. EGPRs are supported by EVEX anyway, so they don't
+        // need to be checked in the first place.
assert(!TakesSimdPrefix(id));
code = insCodeMR(ins);
- if (TakesRex2Prefix(id))
- {
- code = AddRex2Prefix(ins, code);
- }
+ code = AddX86PrefixIfNeeded(id, code, size);
code = insEncodeMRreg(id, code);
- if (ins != INS_test)
+ if (ins != INS_test && !IsShiftInstruction(ins))
{
code |= 2;
}
@@ -15599,7 +16417,17 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
case EA_2BYTE:
// Output a size prefix for a 16-bit operand
- dst += emitOutputByte(dst, 0x66);
+ if (TakesApxExtendedEvexPrefix(id))
+ {
+ assert(IsApxExtendedEvexInstruction(ins));
+ assert(hasEvexPrefix(code));
+ // Evex.pp should already be added when adding the prefix.
+ assert((code & EXTENDED_EVEX_PP_BITS) != 0);
+ }
+ else
+ {
+ dst += emitOutputByte(dst, 0x66);
+ }
FALLTHROUGH;
case EA_4BYTE:
@@ -15650,8 +16478,18 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
}
}
- unsigned regCode = insEncodeReg345(id, regFor345Bits, size, &code);
- regCode |= insEncodeReg012(id, regFor012Bits, size, &code);
+ unsigned regCode;
+ if (!id->idIsEvexNdContextSet() || !IsApxNDDEncodableInstruction(ins))
+ {
+ regCode = insEncodeReg345(id, regFor345Bits, size, &code);
+ regCode |= insEncodeReg012(id, regFor012Bits, size, &code);
+ }
+ else
+ {
+        // A unary ins in NDD form uses EVEX.vvvvv for dst and ModRM.rm for src.
+ code = insEncodeReg3456(id, reg1, size, code);
+ regCode = insEncodeReg012(id, reg2, size, &code);
+ }
if (TakesSimdPrefix(id))
{
@@ -15709,6 +16547,11 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
dst += emitOutputByte(dst, (code >> 8) & 0xFF);
dst += emitOutputByte(dst, (0xC0 | regCode));
}
+ else if (IsApxNDDEncodableInstruction(ins) && id->idIsEvexNdContextSet())
+ {
+ dst += emitOutputByte(dst, (code & 0xFF));
+ dst += emitOutputByte(dst, (0xC0 | regCode | (code >> 8)));
+ }
else
{
dst += emitOutputWord(dst, code);
@@ -15718,155 +16561,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
// Does this instruction operate on a GC ref value?
if (id->idGCref())
{
- switch (id->idInsFmt())
- {
- case IF_RRD_RRD:
- break;
-
- case IF_RWR_RRD:
- {
- if (emitSyncThisObjReg != REG_NA && emitIGisInProlog(emitCurIG) && reg2 == (int)REG_ARG_0)
- {
- // We're relocating "this" in the prolog
- assert(emitComp->lvaIsOriginalThisArg(0));
- assert(emitComp->lvaTable[0].lvRegister);
- assert(emitComp->lvaTable[0].GetRegNum() == reg1);
-
- if (emitFullGCinfo)
- {
- emitGCregLiveSet(id->idGCref(), genRegMask(reg1), dst, true);
- break;
- }
- else
- {
- /* If emitFullGCinfo==false, the we don't use any
- regPtrDsc's and so explicitly note the location
- of "this" in GCEncode.cpp
- */
- }
- }
-
- emitGCregLiveUpd(id->idGCref(), reg1, dst);
- break;
- }
-
- case IF_RRW_RRD:
- {
- switch (id->idIns())
- {
- /*
- This must be one of the following cases:
-
- xor reg, reg to assign NULL
-
- and r1 , r2 if (ptr1 && ptr2) ...
- or r1 , r2 if (ptr1 || ptr2) ...
-
- add r1 , r2 to compute a normal byref
- sub r1 , r2 to compute a strange byref (VC only)
-
- */
- case INS_xor:
- assert(reg1 == reg2);
- emitGCregLiveUpd(id->idGCref(), reg1, dst);
- break;
-
- case INS_or:
- case INS_and:
- emitGCregDeadUpd(reg1, dst);
- break;
-
- case INS_add:
- case INS_sub:
- case INS_sub_hide:
- assert(id->idGCref() == GCT_BYREF);
-
-#if 0
-#ifdef DEBUG
- // Due to elided register moves, we can't have the following assert.
- // For example, consider:
- // t85 = LCL_VAR byref V01 arg1 rdx (last use) REG rdx
- // /--* t85 byref
- // * STORE_LCL_VAR byref V40 tmp31 rdx REG rdx
- // Here, V01 is type `long` on entry, then is stored as a byref. But because
- // the register allocator assigned the same register, no instruction was
- // generated, and we only (currently) make gcref/byref changes in emitter GC info
- // when an instruction is generated. We still generate correct GC info, as this
- // instruction, if writing a GC ref even through reading a long, will go live here.
- // These situations typically occur due to unsafe casting, such as with Span.
-
- regMaskTP regMask;
- regMask = genRegMask(reg1) | genRegMask(reg2);
-
- // r1/r2 could have been a GCREF as GCREF + int=BYREF
- // or BYREF+/-int=BYREF
- assert(((regMask & emitThisGCrefRegs) && (ins == INS_add)) ||
- ((regMask & emitThisByrefRegs) && (ins == INS_add || ins == INS_sub || ins == INS_sub_hide)));
-#endif // DEBUG
-#endif // 0
-
- // Mark r1 as holding a byref
- emitGCregLiveUpd(GCT_BYREF, reg1, dst);
- break;
-
- default:
-#ifdef DEBUG
- emitDispIns(id, false, false, false);
-#endif
- assert(!"unexpected GC reg update instruction");
- }
-
- break;
- }
-
- case IF_RRW_RRW:
- {
- // This must be "xchg reg1, reg2"
- assert(id->idIns() == INS_xchg);
-
- // If we got here, the GC-ness of the registers doesn't match, so we have to "swap" them in the GC
- // register pointer mask.
-
- GCtype gc1, gc2;
-
- gc1 = emitRegGCtype(reg1);
- gc2 = emitRegGCtype(reg2);
-
- if (gc1 != gc2)
- {
- // Kill the GC-info about the GC registers
-
- if (needsGC(gc1))
- {
- emitGCregDeadUpd(reg1, dst);
- }
-
- if (needsGC(gc2))
- {
- emitGCregDeadUpd(reg2, dst);
- }
-
- // Now, swap the info
-
- if (needsGC(gc1))
- {
- emitGCregLiveUpd(gc1, reg2, dst);
- }
-
- if (needsGC(gc2))
- {
- emitGCregLiveUpd(gc2, reg1, dst);
- }
- }
- break;
- }
-
- default:
-#ifdef DEBUG
- emitDispIns(id, false, false, false);
-#endif
- assert(!"unexpected GC ref instruction format");
- }
+ emitHandleGCrefRegs(dst, id);
}
else
{
@@ -15911,8 +16606,9 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id)
code_t code;
instruction ins = id->idIns();
- assert(IsVexOrEvexEncodableInstruction(ins));
- assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins) || IsKInstruction(ins));
+ assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins));
+ assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins) || IsKInstruction(ins) ||
+ IsApxExtendedEvexInstruction(ins));
regNumber targetReg = id->idReg1();
regNumber src1 = id->idReg2();
regNumber src2 = id->idReg3();
@@ -15921,6 +16617,51 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id)
code = insCodeRM(ins);
code = AddX86PrefixIfNeeded(id, code, size);
+ if (IsApxExtendedEvexInstruction(ins) && !IsBMIInstruction(ins))
+ {
+ // TODO-XArch-apx:
+ // For rm-like operand encoding instructions:
+        // legacy promoted EVEX encoding introduces different operand semantics:
+ // op1 - vvvvv
+ // op2 - MODRM.REG
+ // op3 - MODRM.R/M
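+        // e.g. for an NDD "sub dst, src1, src2": dst goes into EVEX.vvvvv while
+        // ModRM encodes the two sources, so swap the locals here to reuse the
+        // reg345/reg012 emission below.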
+ regNumber tmp = src1;
+ src1 = targetReg;
+ targetReg = tmp;
+
+ switch (size)
+ {
+ case EA_1BYTE:
+                // TODO-APX : verify we should never end up here. At least for the instructions
+                // examined so far, we promote to int to do the operation.
+ noway_assert(RBM_BYTE_REGS & genRegMask(src1));
+ noway_assert(RBM_BYTE_REGS & genRegMask(src2));
+ noway_assert(RBM_BYTE_REGS & genRegMask(targetReg));
+ break;
+
+ case EA_2BYTE:
+ case EA_4BYTE:
+ // Set the 'w' bit to get the large version
+ code = insIsCMOV(ins) ? code : (code | (0x01));
+ break;
+
+#ifdef TARGET_AMD64
+ case EA_8BYTE:
+ // TODO-AMD64-CQ: Better way to not emit REX.W when we don't need it
+ // Don't need to zero out the high bits explicitly
+ code = AddRexWPrefix(id, code); // TODO-APX : Revisit. does xor or other cases need to be handled
+ // differently? see emitOutputRR
+ // Set the 'w' bit to get the large version
+ code = insIsCMOV(ins) ? code : (code | (0x01));
+ break;
+
+#endif // TARGET_AMD64
+
+ default:
+ assert(!"unexpected size");
+ }
+ }
+
code = insEncodeRMreg(id, code);
if (TakesRexWPrefix(id))
@@ -15968,7 +16709,10 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id)
dst += emitOutputByte(dst, (0xC0 | regCode));
}
- noway_assert(!id->idGCref());
+ if (id->idGCref())
+ {
+ emitHandleGCrefRegs(dst, id);
+ }
if (!emitInsCanOnlyWriteSSE2OrAVXReg(id))
{
@@ -16151,6 +16895,12 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id)
useACC = true;
}
}
+
+ if (TakesApxExtendedEvexPrefix(id))
+ {
+ // ACC form does not have support for promoted EVEX.
+ useACC = false;
+ }
}
else
{
@@ -16206,7 +16956,10 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id)
case EA_2BYTE:
// Output a size prefix for a 16-bit operand
- dst += emitOutputByte(dst, 0x66);
+ if (!TakesApxExtendedEvexPrefix(id))
+ {
+ dst += emitOutputByte(dst, 0x66);
+ }
FALLTHROUGH;
case EA_4BYTE:
@@ -16884,7 +17637,23 @@ ssize_t emitter::GetInputSizeInBytes(instrDesc* id) const
//
ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte)
{
- assert(TakesEvexPrefix(id));
+ assert(TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id));
+
+ if (!hasTupleTypeInfo(id->idIns()))
+ {
+        // After APX, some instructions with APX features will be promoted
+        // to APX-EVEX. We reuse the existing displacement-emitting path,
+        // but for instructions with no tuple information, APX-EVEX always
+        // treats the scaling factor as 1.
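+        // e.g. a displacement of +64 is emitted directly as a disp8 here, rather
+        // than being divided by the operand's tuple size as is done below.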
+ instruction ins = id->idIns();
+ // TODO-XArch-APX:
+        // This assert may need tweaking if BMI1 instructions are promoted
+        // into EVEX for multiple features; currently only EVEX.NF is handled.
+ assert(IsApxExtendedEvexInstruction(id->idIns()));
+ *dspInByte = ((signed char)dsp == (ssize_t)dsp);
+ return dsp;
+ }
+
insTupleType tt = insTupleTypeInfo(id->idIns());
assert(hasTupleTypeInfo(id->idIns()));
@@ -17539,7 +18308,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
}
// Output a size prefix for a 16-bit operand
- if (size == EA_2BYTE)
+ if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id))
{
dst += emitOutputByte(dst, 0x66);
}
@@ -17555,6 +18324,37 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
break;
}
+ case IF_RWR_RRD_SHF:
+ {
+ assert(IsApxExtendedEvexInstruction(ins));
+ code = insCodeMR(ins);
+ code = AddX86PrefixIfNeeded(id, code, size);
+ code = insEncodeMRreg(id, id->idReg2(), size, code);
+ code = insEncodeReg3456(id, id->idReg1(), size, code);
+
+ // set the W bit
+ if (size != EA_1BYTE)
+ {
+ code |= 1;
+ }
+
+ // Emit the REX prefix if it exists
+ if (TakesRexWPrefix(id))
+ {
+ code = AddRexWPrefix(id, code);
+ }
+
+ dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
+ dst += emitOutputWord(dst, code);
+ dst += emitOutputByte(dst, emitGetInsSC(id));
+ sz = emitSizeOfInsDsc_CNS(id);
+
+ // Update GC info.
+ assert(!id->idGCref());
+ emitGCregDeadUpd(id->idReg1(), dst);
+ break;
+ }
+
case IF_RRD_RRD:
case IF_RWR_RRD:
case IF_RRW_RRD:
@@ -17628,7 +18428,105 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
// Also, determine which operand goes where in the ModRM byte.
regNumber mReg;
regNumber rReg;
- if (hasCodeMR(ins))
+ if (IsApxExtendedEvexInstruction(ins))
+ {
+ assert(hasCodeMI(ins));
+ code = insCodeMI(ins);
+ code = AddX86PrefixIfNeeded(id, code, size);
+ code = insEncodeReg3456(id, id->idReg1(), size, code);
+ mReg = id->idReg2();
+ code = insEncodeMIreg(id, mReg, size, code);
+ rReg = REG_NA;
+ ssize_t val = emitGetInsSC(id);
+ bool valInByte = ((signed char)val == (target_ssize_t)val) && (ins != INS_mov) && (ins != INS_test);
+
+ switch (size)
+ {
+ case EA_1BYTE:
+ break;
+
+ case EA_2BYTE:
+ code |= EXTENDED_EVEX_PP_BITS;
+ FALLTHROUGH;
+
+ case EA_4BYTE:
+ code |= 1;
+ break;
+
+#ifdef TARGET_AMD64
+ case EA_8BYTE:
+ code = AddRexWPrefix(id, code);
+ code |= 1;
+ break;
+#endif // TARGET_AMD64
+
+ default:
+ assert(!"unexpected size");
+ }
+
+ dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code);
+
+ if (valInByte && size > EA_1BYTE)
+ {
+ code |= 2;
+ dst += emitOutputWord(dst, code);
+ dst += emitOutputByte(dst, val);
+ }
+ else
+ {
+ dst += emitOutputWord(dst, code);
+ switch (size)
+ {
+ case EA_1BYTE:
+ dst += emitOutputByte(dst, val);
+ break;
+ case EA_2BYTE:
+ dst += emitOutputWord(dst, val);
+ break;
+ case EA_4BYTE:
+ dst += emitOutputLong(dst, val);
+ break;
+#ifdef TARGET_AMD64
+ case EA_8BYTE:
+ dst += emitOutputLong(dst, val);
+ break;
+#endif // TARGET_AMD64
+ default:
+ break;
+ }
+
+ if (id->idIsCnsReloc())
+ {
+ emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)(size_t)val, IMAGE_REL_BASED_HIGHLOW);
+ assert(size == EA_4BYTE);
+ }
+ }
+
+ sz = emitSizeOfInsDsc_CNS(id);
+
+ if (!emitInsCanOnlyWriteSSE2OrAVXReg(id))
+ {
+ emitGCregDeadUpd(id->idReg1(), dst);
+ }
+
+ switch (id->idInsFmt())
+ {
+ case IF_RWR_RRD_CNS:
+ assert(!instrIs3opImul(ins));
+
+ emitGCregDeadUpd(id->idReg1(), dst);
+ break;
+
+ default:
+#ifdef DEBUG
+ emitDispIns(id, false, false, false);
+#endif
+ assert(!"unexpected GC ref instruction format");
+ }
+
+ break;
+ }
+ else if (hasCodeMR(ins))
{
code = insCodeMR(ins);
// Emit the VEX prefix if it exists
@@ -17863,6 +18761,23 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
{
code = insCodeRM(ins);
+ if (id->idIsEvexNdContextSet() && TakesApxExtendedEvexPrefix(id))
+ {
+ // TODO-XArch-APX:
+            // It is unclear why instructions on this path can have an instruction
+            // format other than IF_RWR_RRD_ARD; fixed up here for debugging purposes
+            // only, needs revisiting.
+ id->idInsFmt(IF_RWR_RRD_ARD);
+
+ code = AddX86PrefixIfNeeded(id, code, size);
+ code = insEncodeReg3456(id, id->idReg1(), size, code);
+ regcode = (insEncodeReg345(id, id->idReg2(), size, &code) << 8);
+ dst = emitOutputAM(dst, id, code | regcode);
+
+ sz = emitSizeOfInsDsc_AMD(id);
+ break;
+ }
+
if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32))
{
// Special case 4-byte AVX instructions as the
@@ -18130,7 +19045,19 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
case IF_RRW_RRD_SRD:
case IF_RWR_RWR_SRD:
{
- assert(IsVexOrEvexEncodableInstruction(ins));
+ assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins));
+
+ if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins))
+ {
+ // EVEX.vvvv has different semantic for APX-EVEX NDD instructions.
+ code = insCodeRM(ins);
+ code = AddX86PrefixIfNeeded(id, code, size);
+ code = insEncodeReg3456(id, id->idReg1(), size, code);
+ regcode = (insEncodeReg345(id, id->idReg2(), size, &code) << 8);
+ dst = emitOutputSV(dst, id, code | regcode);
+ sz = sizeof(instrDesc);
+ break;
+ }
code = insCodeRM(ins);
code = AddX86PrefixIfNeeded(id, code, size);
@@ -19195,6 +20122,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
break;
case IF_RRW:
+ // TODO-XArch-APX: to be verified if this data is correct for NDD form.
+ case IF_RWR_RRD:
// ins reg, cl
result.insThroughput = PERFSCORE_THROUGHPUT_2C;
result.insLatency = PERFSCORE_LATENCY_2C;
@@ -19222,6 +20151,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
switch (insFmt)
{
case IF_RRW:
+ // TODO-XArch-APX: to be verified if this data is correct for NDD form.
+ case IF_RWR_RRD:
// ins reg, 1
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
break;
@@ -19255,6 +20186,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
switch (insFmt)
{
case IF_RRW_SHF:
+ case IF_RWR_RRD_SHF:
// ins reg, cns
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
break;
@@ -20225,6 +21157,11 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_vshuff64x2:
case INS_vshufi32x4:
case INS_vshufi64x2:
+#ifdef TARGET_AMD64
+ case INS_popcnt_apx:
+ case INS_lzcnt_apx:
+ case INS_tzcnt_apx:
+#endif // TARGET_AMD64
{
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += PERFSCORE_LATENCY_3C;
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 5f820c7c022c20..8e149ed5be3389 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -134,11 +134,18 @@ static regNumber getSseShiftRegNumber(instruction ins);
bool HasVexEncoding(instruction ins) const;
bool HasEvexEncoding(instruction ins) const;
bool HasRex2Encoding(instruction ins) const;
+bool HasApxNdd(instruction ins) const;
+bool HasApxNf(instruction ins) const;
bool IsVexEncodableInstruction(instruction ins) const;
bool IsEvexEncodableInstruction(instruction ins) const;
bool IsRex2EncodableInstruction(instruction ins) const;
+bool IsApxNDDEncodableInstruction(instruction ins) const;
+bool IsApxNFEncodableInstruction(instruction ins) const;
+bool IsApxExtendedEvexInstruction(instruction ins) const;
+bool IsShiftInstruction(instruction ins) const;
bool IsLegacyMap1(code_t code) const;
bool IsVexOrEvexEncodableInstruction(instruction ins) const;
+bool DoJitUseApxNDD(instruction ins) const;
code_t insEncodeMIreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code);
@@ -179,6 +186,8 @@ bool AreFlagsSetForSignJumpOpt(regNumber reg, emitAttr opSize, GenCondition cond
insOpts GetEmbRoundingMode(uint8_t mode) const;
+void emitHandleGCrefRegs(BYTE* dst, instrDesc* id);
+
bool hasRexPrefix(code_t code)
{
#ifdef TARGET_AMD64
@@ -332,6 +341,18 @@ void SetUseRex2Encoding(bool value)
useRex2Encodings = value;
}
+// Is Promoted EVEX encoding supported.
+bool usePromotedEVEXEncodings;
+bool UsePromotedEVEXEncoding() const
+{
+ return usePromotedEVEXEncodings;
+}
+
+void SetUsePromotedEVEXEncoding(bool value)
+{
+ usePromotedEVEXEncodings = value;
+}
+
//------------------------------------------------------------------------
// UseSimdEncoding: Returns true if either VEX or EVEX encoding is supported
// contains Evex prefix.
@@ -349,6 +370,7 @@ bool UseSimdEncoding() const
#define EVEX_PREFIX_CODE 0x6200000000000000ULL
bool TakesEvexPrefix(const instrDesc* id) const;
+bool TakesApxExtendedEvexPrefix(const instrDesc* id) const;
//------------------------------------------------------------------------
// hasEvexPrefix: Returns true if the instruction encoding already
@@ -405,11 +427,7 @@ code_t AddSimdPrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size)
//
code_t AddX86PrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size)
{
- // TODO-xarch-apx:
- // consider refactor this part with AddSimdPrefixIfNeeded as a lot of functionality
- // of these functions are overlapping.
-
- if (TakesEvexPrefix(id))
+ if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id))
{
return AddEvexPrefix(id, code, size);
}
@@ -445,7 +463,7 @@ code_t AddX86PrefixIfNeededAndNotPresent(const instrDesc* id, code_t code, emitA
// consider refactor this part with AddSimdPrefixIfNeeded as a lot of functionality
// of these functions are overlapping.
- if (TakesEvexPrefix(id))
+ if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id))
{
return !hasEvexPrefix(code) ? AddEvexPrefix(id, code, size) : code;
}
@@ -511,6 +529,48 @@ void SetEvexEmbMaskIfNeeded(instrDesc* id, insOpts instOptions)
}
}
+//------------------------------------------------------------------------
+// SetEvexNdIfNeeded: set EVEX.ND (new data destination) form if needed.
+//
+// Arguments:
+// id - instruction descriptor
+// instOptions - emit options
+//
+void SetEvexNdIfNeeded(instrDesc* id, insOpts instOptions)
+{
+ if ((instOptions & INS_OPTS_EVEX_nd_MASK) != 0)
+ {
+ assert(UsePromotedEVEXEncoding());
+ assert(IsApxNDDEncodableInstruction(id->idIns()));
+ id->idSetEvexNdContext();
+ }
+ else
+ {
+ assert((instOptions & INS_OPTS_EVEX_nd_MASK) == 0);
+ }
+}
+
+//------------------------------------------------------------------------
+// SetEvexNfIfNeeded: set EVEX.NF (no flags) form if needed.
+//
+// Arguments:
+// id - instruction descriptor
+// instOptions - emit options
+//
+void SetEvexNfIfNeeded(instrDesc* id, insOpts instOptions)
+{
+ if ((instOptions & INS_OPTS_EVEX_nf_MASK) != 0)
+ {
+ assert(UsePromotedEVEXEncoding());
+ assert(IsApxNFEncodableInstruction(id->idIns()));
+ id->idSetEvexNfContext();
+ }
+ else
+ {
+ assert((instOptions & INS_OPTS_EVEX_nf_MASK) == 0);
+ }
+}
+
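As a usage sketch (not part of this diff), a codegen caller would request the promoted forms through the `insOpts` plumbing threaded below; the register choices and the specific emit overloads here are illustrative assumptions:

```cpp
// Hypothetical call sites: ask for the APX NDD form of an add so both
// sources survive, and for the NF form of a shift so EFLAGS are preserved.
emit->emitIns_R_R_R(INS_add, EA_4BYTE, REG_R10, REG_R11, REG_R12, INS_OPTS_EVEX_nd);
// roughly: add r10d, r11d, r12d    (r11d and r12d are left intact)
emit->emitIns_R_I(INS_shl_N, EA_4BYTE, REG_R10, 5, INS_OPTS_EVEX_nf);
// roughly: {nf} shl r10d, 5        (flags untouched)
```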
//------------------------------------------------------------------------
// AddSimdPrefixIfNeeded: Add the correct SIMD prefix.
// Check if the prefix already exists before adding.
@@ -753,7 +813,7 @@ void emitIns_Data16();
void emitIns_I(instruction ins, emitAttr attr, cnsval_ssize_t val);
-void emitIns_R(instruction ins, emitAttr attr, regNumber reg);
+void emitIns_R(instruction ins, emitAttr attr, regNumber reg, insOpts instOptions = INS_OPTS_NONE);
void emitIns_C(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fldHnd, int offs);
@@ -762,7 +822,9 @@ void emitIns_A(instruction ins, emitAttr attr, GenTreeIndir* indir);
void emitIns_R_I(instruction ins,
emitAttr attr,
regNumber reg,
- ssize_t val DEBUGARG(size_t targetHandle = 0) DEBUGARG(GenTreeFlags gtFlags = GTF_EMPTY));
+ ssize_t val,
+ insOpts instOptions = INS_OPTS_NONE DEBUGARG(size_t targetHandle = 0)
+ DEBUGARG(GenTreeFlags gtFlags = GTF_EMPTY));
void emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regNumber srcReg, bool canSkip);
@@ -771,7 +833,7 @@ void emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2,
void emitIns_R_R_I(
instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int ival, insOpts instOptions = INS_OPTS_NONE);
-void emitIns_AR(instruction ins, emitAttr attr, regNumber base, int offs);
+void emitIns_AR(instruction ins, emitAttr attr, regNumber base, int offs, insOpts instOptions = INS_OPTS_NONE);
void emitIns_AR_R_R(instruction ins,
emitAttr attr,
@@ -942,7 +1004,8 @@ void emitIns_R_L(instruction ins, emitAttr attr, BasicBlock* dst, regNumber reg)
void emitIns_R_D(instruction ins, emitAttr attr, unsigned offs, regNumber reg);
-void emitIns_I_AR(instruction ins, emitAttr attr, int val, regNumber reg, int offs);
+void emitIns_I_AR(
+ instruction ins, emitAttr attr, int val, regNumber reg, int offs, insOpts instOptions = INS_OPTS_NONE);
void emitIns_I_AI(instruction ins, emitAttr attr, int val, ssize_t disp);
@@ -953,7 +1016,12 @@ void emitIns_R_AI(instruction ins,
regNumber ireg,
ssize_t disp DEBUGARG(size_t targetHandle = 0) DEBUGARG(GenTreeFlags gtFlags = GTF_EMPTY));
-void emitIns_AR_R(instruction ins, emitAttr attr, regNumber reg, regNumber base, cnsval_ssize_t disp);
+void emitIns_AR_R(instruction ins,
+ emitAttr attr,
+ regNumber reg,
+ regNumber base,
+ cnsval_ssize_t disp,
+ insOpts instOptions = INS_OPTS_NONE);
void emitIns_AI_R(instruction ins, emitAttr attr, regNumber ireg, ssize_t disp);
@@ -974,7 +1042,8 @@ void emitIns_ARX_R(instruction ins,
regNumber base,
regNumber index,
unsigned scale,
- cnsval_ssize_t disp);
+ cnsval_ssize_t disp,
+ insOpts instOptions = INS_OPTS_NONE);
void emitIns_I_AX(instruction ins, emitAttr attr, int val, regNumber reg, unsigned mul, int disp);
@@ -1122,6 +1191,13 @@ void emitIns_SIMD_R_R_R_S_I(instruction ins,
insOpts instOptions);
#endif // FEATURE_HW_INTRINSICS
+void emitIns_BASE_R_R(instruction ins, emitAttr attr, regNumber op1Reg, regNumber op2Reg);
+
+void emitIns_BASE_R_R_I(instruction ins, emitAttr attr, regNumber op1Reg, regNumber op2Reg, int ival);
+
+regNumber emitIns_BASE_R_R_RM(
+ instruction ins, emitAttr attr, regNumber targetReg, GenTree* treeNode, GenTree* regOp, GenTree* rmOp);
+
enum EmitCallType
{
EC_FUNC_TOKEN, // Direct call to a helper/static/nonvirtual/global method (call addr with RIP-relative encoding)
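The new `emitIns_BASE_R_R*` helpers give codegen a single entry point that can choose between the legacy two-address encoding and the APX NDD three-operand encoding; the returned register tells the caller where the result actually landed. A hedged sketch of the intended call shape (all names illustrative):

```cpp
// Sketch: lower "dst = op1 + op2" through the BASE helper. With NDD enabled
// this can become one three-operand add; otherwise it falls back to mov+add.
regNumber resReg = emit->emitIns_BASE_R_R_RM(INS_add, EA_8BYTE, targetReg, node, op1, op2);
if (resReg != targetReg)
{
    // The helper may report a different result register; move only if needed.
    emit->emitIns_Mov(INS_mov, EA_8BYTE, targetReg, resReg, /* canSkip */ true);
}
```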
diff --git a/src/coreclr/jit/fgehopt.cpp b/src/coreclr/jit/fgehopt.cpp
index d9710994cf152b..940077dc6d2d62 100644
--- a/src/coreclr/jit/fgehopt.cpp
+++ b/src/coreclr/jit/fgehopt.cpp
@@ -2761,14 +2761,20 @@ BasicBlock* Compiler::fgCloneTryRegion(BasicBlock* tryEntry, CloneTryInfo& info,
// this is cheaper than any other insertion point, as no existing regions get renumbered.
//
unsigned insertBeforeIndex = enclosingTryIndex;
- if (insertBeforeIndex == EHblkDsc::NO_ENCLOSING_INDEX)
+ if ((enclosingTryIndex == EHblkDsc::NO_ENCLOSING_INDEX) && (enclosingHndIndex == EHblkDsc::NO_ENCLOSING_INDEX))
{
- JITDUMP("Cloned EH clauses will go at the end of the EH table\n");
+ JITDUMP("No enclosing EH region; cloned EH clauses will go at the end of the EH table\n");
insertBeforeIndex = compHndBBtabCount;
}
+ else if ((enclosingTryIndex == EHblkDsc::NO_ENCLOSING_INDEX) || (enclosingHndIndex < enclosingTryIndex))
+ {
+ JITDUMP("Cloned EH clauses will go before enclosing handler region EH#%02u\n", enclosingHndIndex);
+ insertBeforeIndex = enclosingHndIndex;
+ }
else
{
- JITDUMP("Cloned EH clauses will go before enclosing region EH#%02u\n", enclosingTryIndex);
+ JITDUMP("Cloned EH clauses will go before enclosing try region EH#%02u\n", enclosingTryIndex);
+ assert(insertBeforeIndex == enclosingTryIndex);
}
// Once we call fgTryAddEHTableEntries with deferCloning = false,
@@ -2989,7 +2995,7 @@ BasicBlock* Compiler::fgCloneTryRegion(BasicBlock* tryEntry, CloneTryInfo& info,
const unsigned originalTryIndex = block->getTryIndex();
unsigned cloneTryIndex = originalTryIndex;
- if (originalTryIndex <= outermostTryIndex)
+ if (originalTryIndex < enclosingTryIndex)
{
cloneTryIndex += indexShift;
}
@@ -3003,11 +3009,15 @@ BasicBlock* Compiler::fgCloneTryRegion(BasicBlock* tryEntry, CloneTryInfo& info,
if (block->hasHndIndex())
{
const unsigned originalHndIndex = block->getHndIndex();
+ unsigned cloneHndIndex = originalHndIndex;
+
+ if (originalHndIndex < enclosingHndIndex)
+ {
+ cloneHndIndex += indexShift;
+ }
- // if (originalHndIndex ==
- const unsigned cloneHndIndex = originalHndIndex + indexShift;
- EHblkDsc* const originalEbd = ehGetDsc(originalHndIndex);
- EHblkDsc* const clonedEbd = ehGetDsc(cloneHndIndex);
+ EHblkDsc* const originalEbd = ehGetDsc(originalHndIndex);
+ EHblkDsc* const clonedEbd = ehGetDsc(cloneHndIndex);
newBlock->setHndIndex(cloneHndIndex);
updateBlockReferences(cloneHndIndex);
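The corrected insertion-point choice above can be restated standalone (a sketch; `NONE` stands in for `EHblkDsc::NO_ENCLOSING_INDEX`, and the EH table is ordered inner-to-outer, so the cloned clauses must precede whichever enclosing region appears first):

```cpp
unsigned ChooseInsertBeforeIndex(unsigned enclosingTry, unsigned enclosingHnd, unsigned tableCount)
{
    const unsigned NONE = ~0u; // stand-in for EHblkDsc::NO_ENCLOSING_INDEX

    if ((enclosingTry == NONE) && (enclosingHnd == NONE))
    {
        return tableCount; // no enclosing region: append at the end of the table
    }
    if ((enclosingTry == NONE) || (enclosingHnd < enclosingTry))
    {
        return enclosingHnd; // enclosing handler region comes first in the table
    }
    return enclosingTry; // enclosing try region comes first in the table
}
```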
diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp
index 6187274c70332f..2c0ca450bc8243 100644
--- a/src/coreclr/jit/fgopt.cpp
+++ b/src/coreclr/jit/fgopt.cpp
@@ -2213,27 +2213,6 @@ bool Compiler::fgOptimizeUncondBranchToSimpleCond(BasicBlock* block, BasicBlock*
// At this point we know target is BBJ_COND.
assert(target->KindIs(BBJ_COND));
- // Bail out if OSR, as we can have unusual flow into loops. If one
- // of target's successors is also a backedge target, this optimization
- // may mess up loop recognition by creating too many non-loop preds.
- //
- if (opts.IsOSR())
- {
- if (target->GetFalseTarget()->HasFlag(BBF_BACKWARD_JUMP_TARGET))
- {
- JITDUMP("Deferring: " FMT_BB " --> " FMT_BB "; latter looks like loop top\n", target->bbNum,
- target->GetFalseTarget()->bbNum);
- return false;
- }
-
- if (target->GetTrueTarget()->HasFlag(BBF_BACKWARD_JUMP_TARGET))
- {
- JITDUMP("Deferring: " FMT_BB " --> " FMT_BB "; latter looks like loop top\n", target->bbNum,
- target->GetTrueTarget()->bbNum);
- return false;
- }
- }
-
// See if this block assigns constant or other interesting tree to that same local.
//
if (!fgBlockEndFavorsTailDuplication(block, lclNum))
@@ -4952,7 +4931,6 @@ Compiler::ThreeOptLayout::ThreeOptLayout(Compiler* comp)
, blockOrder(nullptr)
, tempOrder(nullptr)
, numCandidateBlocks(0)
- , currEHRegion(0)
{
}
@@ -5146,7 +5124,7 @@ void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge)
BasicBlock* const dstBlk = edge->getDestinationBlock();
// Ignore cross-region branches
- if ((srcBlk->bbTryIndex != currEHRegion) || (dstBlk->bbTryIndex != currEHRegion))
+ if (!BasicBlock::sameTryRegion(srcBlk, dstBlk))
{
return;
}
@@ -5245,8 +5223,7 @@ void Compiler::ThreeOptLayout::AddNonFallthroughPreds(unsigned blockPos)
}
//-----------------------------------------------------------------------------
-// Compiler::ThreeOptLayout::Run: Runs 3-opt for each contiguous region of the block list
-// we're interested in reordering.
+// Compiler::ThreeOptLayout::Run: Runs 3-opt on the candidate span of hot blocks.
// We skip reordering handler regions for now, as these are assumed to be cold.
//
void Compiler::ThreeOptLayout::Run()
@@ -5292,41 +5269,9 @@ void Compiler::ThreeOptLayout::Run()
// Repurpose 'bbPostorderNum' for the block's ordinal
block->bbPostorderNum = numCandidateBlocks++;
-
- // While walking the span of blocks to reorder,
- // remember where each try region ends within this span.
- // We'll use this information to run 3-opt per region.
- EHblkDsc* const HBtab = compiler->ehGetBlockTryDsc(block);
- if (HBtab != nullptr)
- {
- HBtab->ebdTryLast = block;
- }
- }
-
- // Reorder try regions first
- bool modified = false;
- for (EHblkDsc* const HBtab : EHClauses(compiler))
- {
- // If multiple region indices map to the same region,
- // make sure we reorder its blocks only once
- BasicBlock* const tryBeg = HBtab->ebdTryBeg;
- if (tryBeg->getTryIndex() != currEHRegion++)
- {
- continue;
- }
-
- // Only reorder try regions within the candidate span of blocks
- if ((tryBeg->bbPostorderNum < numCandidateBlocks) && (blockOrder[tryBeg->bbPostorderNum] == tryBeg))
- {
- JITDUMP("Running 3-opt for try region #%d\n", (currEHRegion - 1));
- modified |= RunThreeOptPass(tryBeg, HBtab->ebdTryLast);
- }
}
- // Finally, reorder the main method body
- currEHRegion = 0;
- JITDUMP("Running 3-opt for main method body\n");
- modified |= RunThreeOptPass(compiler->fgFirstBB, blockOrder[numCandidateBlocks - 1]);
+ const bool modified = RunThreeOptPass();
if (modified)
{
@@ -5335,14 +5280,25 @@ void Compiler::ThreeOptLayout::Run()
BasicBlock* const block = blockOrder[i - 1];
BasicBlock* const next = blockOrder[i];
+ if (block->NextIs(next))
+ {
+ continue;
+ }
+
// Only reorder within EH regions to maintain contiguity.
- // TODO: Allow moving blocks in different regions when 'next' is the region entry.
- // This would allow us to move entire regions up/down because of the contiguity requirement.
- if (!block->NextIs(next) && BasicBlock::sameEHRegion(block, next))
+ if (!BasicBlock::sameEHRegion(block, next))
+ {
+ continue;
+ }
+
+ // Don't move the entry of an EH region.
+ if (compiler->bbIsTryBeg(next) || compiler->bbIsHandlerBeg(next))
{
- compiler->fgUnlinkBlock(next);
- compiler->fgInsertBBafter(block, next);
+ continue;
}
+
+ compiler->fgUnlinkBlock(next);
+ compiler->fgInsertBBafter(block, next);
}
}
}
@@ -5487,12 +5443,6 @@ bool Compiler::ThreeOptLayout::RunGreedyThreeOptPass(unsigned startPos, unsigned
continue;
}
- // Don't consider any cut points that would disturb other EH regions
- if (!BasicBlock::sameEHRegion(s2Block, s3Block))
- {
- continue;
- }
-
// Compute the cost delta of this partition
const weight_t currCost = currCostBase + GetCost(s3BlockPrev, s3Block);
const weight_t newCost =
@@ -5550,22 +5500,15 @@ bool Compiler::ThreeOptLayout::RunGreedyThreeOptPass(unsigned startPos, unsigned
}
//-----------------------------------------------------------------------------
-// Compiler::ThreeOptLayout::RunThreeOptPass: Runs 3-opt for the given block range.
-//
-// Parameters:
-// startBlock - The first block of the range to reorder
-// endBlock - The last block (inclusive) of the range to reorder
+// Compiler::ThreeOptLayout::RunThreeOptPass: Runs 3-opt on the candidate span of blocks.
//
// Returns:
// True if we reordered anything, false otherwise
//
-bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock)
+bool Compiler::ThreeOptLayout::RunThreeOptPass()
{
- assert(startBlock != nullptr);
- assert(endBlock != nullptr);
-
- const unsigned startPos = startBlock->bbPostorderNum;
- const unsigned endPos = endBlock->bbPostorderNum;
+ const unsigned startPos = 0;
+ const unsigned endPos = numCandidateBlocks - 1;
const unsigned numBlocks = (endPos - startPos + 1);
assert(startPos <= endPos);
diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp
index 2a21e61f0d3a09..63634dc2edeb54 100644
--- a/src/coreclr/jit/fgprofile.cpp
+++ b/src/coreclr/jit/fgprofile.cpp
@@ -4210,10 +4210,7 @@ bool Compiler::fgIncorporateEdgeCounts()
//
PhaseStatus Compiler::fgComputeBlockWeights()
{
- const bool usingProfileWeights = fgIsUsingProfileWeights();
- bool madeChanges = false;
- fgModified = false;
- fgCalledCount = BB_UNITY_WEIGHT;
+ fgModified = false;
#if DEBUG
if (verbose)
@@ -4223,40 +4220,38 @@ PhaseStatus Compiler::fgComputeBlockWeights()
}
#endif // DEBUG
- weight_t returnWeight = BB_UNITY_WEIGHT;
-
- madeChanges |= fgComputeMissingBlockWeights(&returnWeight);
-
- if (usingProfileWeights)
+ if (fgIsUsingProfileWeights())
{
- madeChanges |= fgComputeCalledCount(returnWeight);
- }
- else
- {
- JITDUMP(" -- no profile data, so using default called count\n");
+ // Compute fgCalledCount by subtracting any non-entry flow into fgFirstBB from its weight
+ fgCalledCount = fgFirstBB->bbWeight;
+ for (FlowEdge* const predEdge : fgFirstBB->PredEdges())
+ {
+ fgCalledCount = max(BB_ZERO_WEIGHT, fgCalledCount - predEdge->getLikelyWeight());
+ }
+
+ JITDUMP("We are using the profile weights and fgCalledCount is " FMT_WT "\n", fgCalledCount);
+ return PhaseStatus::MODIFIED_NOTHING;
}
- return madeChanges ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING;
+ JITDUMP(" -- no profile data, so using default called count\n");
+ fgCalledCount = BB_UNITY_WEIGHT;
+ return fgComputeMissingBlockWeights() ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING;
}
//-------------------------------------------------------------
// fgComputeMissingBlockWeights: determine weights for blocks
// that were not profiled and do not yet have weights.
//
-// Arguments
-// returnWeight [out] - sum of weights for all return and throw blocks
-//
// Returns:
// true if any changes made
//
-bool Compiler::fgComputeMissingBlockWeights(weight_t* returnWeight)
+bool Compiler::fgComputeMissingBlockWeights()
{
BasicBlock* bSrc;
BasicBlock* bDst;
unsigned iterations = 0;
bool changed;
bool modified = false;
- weight_t weight;
// If we have any blocks that did not have profile derived weight
// we will try to fix their weight up here
@@ -4265,7 +4260,6 @@ bool Compiler::fgComputeMissingBlockWeights(weight_t* returnWeight)
do // while (changed)
{
changed = false;
- weight = 0;
iterations++;
for (bDst = fgFirstBB; bDst != nullptr; bDst = bDst->Next())
@@ -4376,14 +4370,6 @@ bool Compiler::fgComputeMissingBlockWeights(weight_t* returnWeight)
bDst->bbSetRunRarely();
}
}
-
- // Sum up the weights of all of the return blocks and throw blocks
- // This is used when we have a back-edge into block 1
- //
- if (bDst->hasProfileWeight() && bDst->KindIs(BBJ_RETURN, BBJ_THROW))
- {
- weight += bDst->bbWeight;
- }
}
}
// Generally when we synthesize profile estimates we do it in a way where this algorithm will converge
@@ -4400,84 +4386,9 @@ bool Compiler::fgComputeMissingBlockWeights(weight_t* returnWeight)
}
#endif
- *returnWeight = weight;
-
return modified;
}
-//-------------------------------------------------------------
-// fgComputeCalledCount: when profile information is in use,
-// compute fgCalledCount
-//
-// Argument:
-// returnWeight - sum of weights for all return and throw blocks
-//
-// Returns:
-// true if any changes were made
-//
-bool Compiler::fgComputeCalledCount(weight_t returnWeight)
-{
- // When we are not using profile data we have already setup fgCalledCount
- // only set it here if we are using profile data
- assert(fgIsUsingProfileWeights());
- bool madeChanges = false;
-
- BasicBlock* firstILBlock = fgFirstBB; // The first block for IL code (i.e. for the IL code at offset 0)
-
- // OSR methods can have complex entry flow, and so
- // for OSR we ensure fgFirstBB has plausible profile data.
- //
- if (!opts.IsOSR())
- {
- // Skip past any/all BBF_INTERNAL blocks that may have been added before the first real IL block.
- //
- while (firstILBlock->HasFlag(BBF_INTERNAL))
- {
- firstILBlock = firstILBlock->Next();
- }
- }
-
- // The 'firstILBlock' is now expected to have a profile-derived weight
- assert(firstILBlock->hasProfileWeight());
-
- // If the first block only has one ref then we use its weight for fgCalledCount.
- // Otherwise we have backedges into the first block, so instead we use the sum
- // of the return block weights for fgCalledCount.
- //
- // If the profile data has a 0 for the returnWeight
- // (i.e. the function never returns because it always throws)
- // then just use the first block weight rather than 0.
- //
- if ((firstILBlock->countOfInEdges() == 1) || (returnWeight == BB_ZERO_WEIGHT))
- {
- fgCalledCount = firstILBlock->bbWeight;
- }
- else
- {
- fgCalledCount = returnWeight;
- }
-
- // If we allocated a scratch block as the first BB then we need
- // to set its profile-derived weight to be fgCalledCount
- if (fgFirstBB->HasFlag(BBF_INTERNAL))
- {
- fgFirstBB->setBBProfileWeight(fgCalledCount);
- madeChanges = true;
- JITDUMP("fgComputeCalledCount: Modified method entry weight. Data %s inconsistent.\n",
- fgPgoConsistent ? "is now" : "was already");
- fgPgoConsistent = false;
- }
-
-#if DEBUG
- if (verbose)
- {
- printf("We are using the Profile Weights and fgCalledCount is " FMT_WT "\n", fgCalledCount);
- }
-#endif
-
- return madeChanges;
-}
-
//------------------------------------------------------------------------
// fgProfileWeightsEqual: check if two profile weights are equal
// (or nearly so)
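A worked example of the simplified computation (weights assumed for illustration): if `fgFirstBB` carries a profile weight of 100 and a single loop back-edge into it has a likely weight of 60, the method was actually called 40 times; the `max` with `BB_ZERO_WEIGHT` only guards against mildly inconsistent counts driving the result negative.

```cpp
//   fgFirstBB->bbWeight         = 100  (entry weight, including back-edge flow)
//   backEdge->getLikelyWeight() = 60   (loop flow re-entering fgFirstBB)
//   fgCalledCount = max(BB_ZERO_WEIGHT, 100 - 60) = 40
```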
diff --git a/src/coreclr/jit/helperexpansion.cpp b/src/coreclr/jit/helperexpansion.cpp
index 8407fd469eff6c..1d1eb069819e75 100644
--- a/src/coreclr/jit/helperexpansion.cpp
+++ b/src/coreclr/jit/helperexpansion.cpp
@@ -744,7 +744,7 @@ bool Compiler::fgExpandThreadLocalAccessForCallNativeAOT(BasicBlock** pBlock, St
fastPathBb->inheritWeight(prevBb);
// fallback will just execute first time
- fallbackBb->bbSetRunRarely();
+ fallbackBb->inheritWeightPercentage(tlsRootNullCondBB, 0);
fgRedirectTargetEdge(prevBb, tlsRootNullCondBB);
@@ -1180,7 +1180,7 @@ bool Compiler::fgExpandThreadLocalAccessForCall(BasicBlock** pBlock, Statement*
fastPathBb->inheritWeight(prevBb);
// fallback will just execute first time
- fallbackBb->bbSetRunRarely();
+ fallbackBb->inheritWeightPercentage(prevBb, 0);
// All blocks are expected to be in the same EH region
assert(BasicBlock::sameEHRegion(prevBb, block));
@@ -1545,7 +1545,7 @@ bool Compiler::fgExpandStaticInitForCall(BasicBlock** pBlock, Statement* stmt, G
block->inheritWeight(prevBb);
isInitedBb->inheritWeight(prevBb);
- helperCallBb->bbSetRunRarely();
+ helperCallBb->inheritWeightPercentage(isInitedBb, 0);
// All blocks are expected to be in the same EH region
assert(BasicBlock::sameEHRegion(prevBb, block));
@@ -1847,6 +1847,7 @@ bool Compiler::fgVNBasedIntrinsicExpansionForCall_ReadUtf8(BasicBlock** pBlock,
//
// Redirect prevBb to lengthCheckBb
fgRedirectTargetEdge(prevBb, lengthCheckBb);
+ lengthCheckBb->inheritWeight(prevBb);
assert(prevBb->JumpsToNext());
{
@@ -1859,6 +1860,11 @@ bool Compiler::fgVNBasedIntrinsicExpansionForCall_ReadUtf8(BasicBlock** pBlock,
// review: we assume length check always succeeds??
trueEdge->setLikelihood(1.0);
falseEdge->setLikelihood(0.0);
+
+ if (lengthCheckBb->hasProfileWeight())
+ {
+ fastpathBb->setBBProfileWeight(falseEdge->getLikelyWeight());
+ }
}
{
@@ -1869,10 +1875,8 @@ bool Compiler::fgVNBasedIntrinsicExpansionForCall_ReadUtf8(BasicBlock** pBlock,
}
//
- // Re-distribute weights
+ // Ensure all flow out of prevBb converges into block
//
- lengthCheckBb->inheritWeight(prevBb);
- fastpathBb->inheritWeight(lengthCheckBb);
block->inheritWeight(prevBb);
// All blocks are expected to be in the same EH region
@@ -2551,11 +2555,18 @@ bool Compiler::fgLateCastExpansionForCall(BasicBlock** pBlock, Statement* stmt,
trueEdge->setLikelihood(nullcheckTrueLikelihood);
}
+ // Set nullcheckBb's weight here, so we can propagate it to its successors below
+ nullcheckBb->inheritWeight(firstBb);
+
if (typeCheckNotNeeded)
{
FlowEdge* const falseEdge = fgAddRefPred(fallbackBb, nullcheckBb);
nullcheckBb->SetFalseEdge(falseEdge);
falseEdge->setLikelihood(nullcheckFalseLikelihood);
+ fallbackBb->inheritWeight(nullcheckBb);
+ fallbackBb->scaleBBWeight(nullcheckFalseLikelihood);
+ lastBb->inheritWeight(nullcheckBb);
+ lastBb->scaleBBWeight(nullcheckTrueLikelihood);
typeCheckSucceedBb = nullptr;
}
@@ -2631,7 +2642,6 @@ bool Compiler::fgLateCastExpansionForCall(BasicBlock** pBlock, Statement* stmt,
// The same goes for inherited weights -- the block where we test for B will have
// the weight of A times the likelihood that A's test fails, etc.
//
- nullcheckBb->inheritWeight(firstBb);
weight_t sumOfPreviousLikelihood = 0;
for (int candidateId = 0; candidateId < numOfCandidates; candidateId++)
{
@@ -2666,28 +2676,22 @@ bool Compiler::fgLateCastExpansionForCall(BasicBlock** pBlock, Statement* stmt,
sumOfPreviousLikelihood += likelihood;
}
- if (fallbackBb->KindIs(BBJ_THROW))
- {
- fallbackBb->bbSetRunRarely();
- }
- else
+ fallbackBb->inheritWeight(lastTypeCheckBb);
+ fallbackBb->scaleBBWeight(lastTypeCheckBb->GetFalseEdge()->getLikelihood());
+
+ if (fallbackBb->KindIs(BBJ_ALWAYS))
{
- assert(fallbackBb->KindIs(BBJ_ALWAYS));
FlowEdge* const newEdge = fgAddRefPred(lastBb, fallbackBb);
fallbackBb->SetTargetEdge(newEdge);
- fallbackBb->inheritWeight(lastTypeCheckBb);
- weight_t lastTypeCheckFailedLikelihood = lastTypeCheckBb->GetFalseEdge()->getLikelihood();
- fallbackBb->scaleBBWeight(lastTypeCheckFailedLikelihood);
}
if (!typeCheckNotNeeded)
{
typeCheckSucceedBb->inheritWeight(typeChecksBbs[0]);
typeCheckSucceedBb->scaleBBWeight(sumOfPreviousLikelihood);
+ lastBb->inheritWeight(firstBb);
}
- lastBb->inheritWeight(firstBb);
-
//
// Validate EH regions
//
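The substitution repeated throughout this file (`bbSetRunRarely` to `inheritWeightPercentage(pred, 0)`) keeps a fallback's zero weight tied to its predecessor, so the profile-derived property propagates instead of being dropped. In isolation:

```cpp
// Old: unconditionally mark the block rarely run, severing any link to profile data.
fallbackBb->bbSetRunRarely();

// New: derive 0% of the predecessor's weight; the block still ends up with
// weight zero, but it stays profile-derived whenever the predecessor is.
fallbackBb->inheritWeightPercentage(prevBb, 0);
```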
diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h
index 4f00ca62627966..3ff3708785cd4b 100644
--- a/src/coreclr/jit/instr.h
+++ b/src/coreclr/jit/instr.h
@@ -221,6 +221,12 @@ enum insFlags : uint64_t
// APX: REX2 prefix:
Encoding_REX2 = 1ULL << 44,
+ // APX: EVEX.ND:
+ INS_Flags_Has_NDD = 1ULL << 45,
+
+ // APX: EVEX.NF:
+ INS_Flags_Has_NF = 1ULL << 46,
+
// TODO-Cleanup: Remove this flag and its usage from TARGET_XARCH
INS_FLAGS_DONT_CARE = 0x00ULL,
};
@@ -259,6 +265,19 @@ enum insOpts: unsigned
INS_OPTS_EVEX_z_MASK = 0x20, // mask for EVEX.z related features
INS_OPTS_EVEX_em_zero = 1 << 5, // Embedded mask merges with zero
+
+ // One-bit: 0b0100_0000
+ INS_OPTS_EVEX_nd_MASK = 0x40, // mask for APX-EVEX.nd related features
+
+ INS_OPTS_EVEX_nd = 1 << 6, // NDD form for legacy instructions
+
+ // One-bit: 0b1000_0000
+ INS_OPTS_EVEX_nf_MASK = 0x80, // mask for APX-EVEX.nf related features
+
+ INS_OPTS_EVEX_nf = 1 << 7, // NF (no flags) form for legacy instructions
+
+ INS_OPTS_EVEX_NoApxPromotion = 1 << 8, // Do not promote to APX-EVEX
+
};
#elif defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
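Since each new APX option occupies a single bit, its `_MASK` constant coincides with the flag itself, unlike the multi-bit `b`/`z` masks earlier in the enum; a quick sanity sketch (the asserts are illustrative, not part of the change):

```cpp
static_assert(INS_OPTS_EVEX_nd_MASK == INS_OPTS_EVEX_nd, "nd is a single-bit feature");
static_assert(INS_OPTS_EVEX_nf_MASK == INS_OPTS_EVEX_nf, "nf is a single-bit feature");
```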
diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h
index 24be0ef3527b6a..f4c5df821190af 100644
--- a/src/coreclr/jit/instrsxarch.h
+++ b/src/coreclr/jit/instrsxarch.h
@@ -58,26 +58,26 @@ INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE,
INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, Encoding_REX2)
INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, Encoding_REX2)
-INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit |Encoding_REX2)
-INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2)
-INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2)
+INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, 0x0000FE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2 | INS_Flags_Has_NF)
+INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, 0x0008FE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2 | INS_Flags_Has_NF)
// Multi-byte opcodes without modrm are represented in mixed endian fashion.
// See comment around quarter way through this file for more information.
INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_TT_NONE, Encoding_REX2)
// id nm um mr mi rm a4 tt flags
-INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2)
+INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2)
INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2)
+INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
// Does not affect the stack tracking in the emitter
INST4(sub_hide, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2)
+INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2)
INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2)
@@ -99,25 +99,25 @@ INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE,
#endif
INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_TT_NONE, Reads_OF | Encoding_REX2)
-INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_TT_NONE, Reads_OF | Encoding_REX2)
-INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_TT_NONE, Reads_CF | Encoding_REX2)
-INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF | Encoding_REX2)
-INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_TT_NONE, Reads_ZF | Encoding_REX2)
-INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_TT_NONE, Reads_ZF | Encoding_REX2)
-INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2)
-INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2)
-INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_TT_NONE, Reads_SF | Encoding_REX2)
-INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, INS_TT_NONE, Reads_SF | Encoding_REX2)
-INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, INS_TT_NONE, Reads_PF | Encoding_REX2)
-INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_TT_NONE, Reads_PF | Encoding_REX2)
-INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2)
-INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2)
-INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2)
-INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2)
+INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_TT_NONE, Reads_OF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_TT_NONE, Reads_OF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_TT_NONE, Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_TT_NONE, Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_TT_NONE, Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_TT_NONE, Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, INS_TT_NONE, Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, INS_TT_NONE, Reads_PF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_TT_NONE, Reads_PF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD)
+INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD)
INST3(xchg, "xchg", IUM_RW, 0x000086, BAD_CODE, 0x000086, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NDD | INS_Flags_Has_NF)
// id nm um mr mi rm tt flags
@@ -125,25 +125,25 @@ INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE,
// as 2-operand instructions with the target register being implicit
// implicit_reg = op1*op2_icon
#define INSTMUL INST3
-INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 0x000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 0x000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
#ifdef TARGET_AMD64
-INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 0x4400002068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
-INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit)
+INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 0x4400002068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
+INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF)
#endif // TARGET_AMD64
@@ -593,11 +593,11 @@ INST3(LAST_AVXVNNI_INSTRUCTION, "LAST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BA
INST3(FIRST_BMI_INSTRUCTION, "FIRST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
// BMI1
-INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF) // Logical AND NOT
-INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF) // Bit Field Extract
-INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Extract Lowest Set Isolated Bit
-INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Get Mask Up to Lowest Set Bit
-INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Reset Lowest Set Bit
+INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Logical AND NOT
+INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Bit Field Extract
+INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Extract Lowest Set Isolated Bit
+INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Get Mask Up to Lowest Set Bit
+INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Reset Lowest Set Bit
// BMI2
INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Zero High Bits Starting with Specified Bit Position
@@ -952,35 +952,43 @@ INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE,
// POPCNT
INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | Encoding_REX2)
+#if defined(TARGET_AMD64)
+INST3(tzcnt_apx, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x0000F4, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Count the Number of Trailing Zero Bits
+INST3(lzcnt_apx, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x0000F5, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF)
+INST3(popcnt_apx, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x000088, INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | INS_Flags_Has_NF)
+#endif // TARGET_AMD64
+
+INST3(neg, "neg", IUM_RW, 0x0018F6, BAD_CODE, 0x0018F6, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(not, "not", IUM_RW, 0x0010F6, BAD_CODE, 0x0010F6, INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD)
+
+INST3(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, 0x0000D2, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, 0x0000D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, 0x0008D2, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, 0x0008D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+
+INST3(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, 0x0010D2, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, 0x0010D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, 0x0018D2, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, 0x0018D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, 0x0020D2, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, 0x0020D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, 0x0028D2, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, 0x0028D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, 0x0038D2, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, 0x0038D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+
// id nm um mr mi flags
INST2(ret, "ret", IUM_RD, 0x0000C3, 0x0000C2, INS_TT_NONE, INS_FLAGS_None)
INST2(loop, "loop", IUM_RD, BAD_CODE, 0x0000E2, INS_TT_NONE, INS_FLAGS_None)
INST2(call, "call", IUM_RD, 0x0010FF, 0x0000E8, INS_TT_NONE, Encoding_REX2)
-INST2(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-
-INST2(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-
-
// id nm um mr flags
INST1(r_movsb, "rep movsb", IUM_RD, 0x00A4F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit)
INST1(r_movsd, "rep movsd", IUM_RD, 0x00A5F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit)
@@ -1012,15 +1020,12 @@ INST1(leave, "leave", IUM_RD, 0x0000C9,
INST1(serialize, "serialize", IUM_RD, 0x0fe801, INS_TT_NONE, INS_FLAGS_None)
-INST1(neg, "neg", IUM_RW, 0x0018F6, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST1(not, "not", IUM_RW, 0x0010F6, INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit | Encoding_REX2)
-
INST1(cwde, "cwde", IUM_RD, 0x000098, INS_TT_NONE, INS_FLAGS_None)
INST1(cdq, "cdq", IUM_RD, 0x000099, INS_TT_NONE, INS_FLAGS_None)
-INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit)
-INST1(div, "div", IUM_RD, 0x0030F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
+INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF)
+INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | INS_Flags_Has_NF)
+INST1(div, "div", IUM_RD, 0x0030F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF)
+INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF)
INST1(sahf, "sahf", IUM_RD, 0x00009E, INS_TT_NONE, Restore_SF_ZF_AF_PF_CF)
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 134d5ffc99ca00..ab8c6495027003 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -369,8 +369,9 @@ RELEASE_CONFIG_INTEGER(EnableMultiRegLocals, "EnableMultiRegLocals", 1)
RELEASE_CONFIG_INTEGER(JitNoInline, "JitNoInline", 0)
#if defined(DEBUG)
-CONFIG_INTEGER(JitStressRex2Encoding, "JitStressRex2Encoding", 0) // Enable rex2 encoding for legacy instructions.
-CONFIG_INTEGER(JitBypassAPXCheck, "JitBypassAPXCheck", 0) // Bypass APX CPUID check.
+CONFIG_INTEGER(JitStressRex2Encoding, "JitStressRex2Encoding", 0) // Enable rex2 encoding for compatible instructions.
+CONFIG_INTEGER(JitStressPromotedEvexEncoding, "JitStressPromotedEvexEncoding", 0) // Enable promoted EVEX encoding for
+ // compatible instructions.
#endif
// clang-format off
@@ -440,6 +441,7 @@ RELEASE_CONFIG_INTEGER(EnableArm64Sve, "EnableArm64Sve",
RELEASE_CONFIG_INTEGER(EnableEmbeddedBroadcast, "EnableEmbeddedBroadcast", 1) // Allows embedded broadcasts to be disabled
RELEASE_CONFIG_INTEGER(EnableEmbeddedMasking, "EnableEmbeddedMasking", 1) // Allows embedded masking to be disabled
+RELEASE_CONFIG_INTEGER(EnableApxNDD, "EnableApxNDD", 0) // Allows APX NDD feature to be disabled
// clang-format on
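The new knob ships disabled by default. In the JIT it would be read through the generated `JitConfig` accessor; a plausible gating site looks like the following sketch (the pairing with the ISA check is an assumption here, not shown in this hunk):

```cpp
// Hypothetical gate: NDD emission requires both hardware/ISA support and the
// config switch to be turned on (DOTNET_EnableApxNDD=1).
const bool canUseApxNdd =
    compOpportunisticallyDependsOn(InstructionSet_APX) && (JitConfig.EnableApxNDD() != 0);
```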
diff --git a/src/coreclr/jit/jiteh.cpp b/src/coreclr/jit/jiteh.cpp
index 60c830aad8d592..c833f2164fa0bc 100644
--- a/src/coreclr/jit/jiteh.cpp
+++ b/src/coreclr/jit/jiteh.cpp
@@ -2660,7 +2660,7 @@ bool Compiler::fgCreateFiltersForGenericExceptions()
filterBb->bbCodeOffs = handlerBb->bbCodeOffs;
filterBb->bbHndIndex = handlerBb->bbHndIndex;
filterBb->bbTryIndex = handlerBb->bbTryIndex;
- filterBb->bbSetRunRarely();
+ filterBb->inheritWeightPercentage(handlerBb, 0);
filterBb->SetFlags(BBF_INTERNAL | BBF_DONT_REMOVE);
handlerBb->bbCatchTyp = BBCT_FILTER_HANDLER;
diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index 724dd17082a4e3..3b022ca56c857b 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -511,6 +511,11 @@ GenTree* Lowering::LowerNode(GenTree* node)
}
#endif
break;
+ case GT_NOT:
+#ifdef TARGET_ARM64
+ ContainCheckNot(node->AsOp());
+#endif
+ break;
case GT_SELECT:
return LowerSelect(node->AsConditional());
@@ -1066,11 +1071,11 @@ GenTree* Lowering::LowerSwitch(GenTree* node)
for (unsigned i = 0; i < jumpCnt - 1; ++i)
{
assert(currentBlock != nullptr);
- BasicBlock* const targetBlock = jumpTab[i]->getDestinationBlock();
// Remove the switch from the predecessor list of this case target's block.
// We'll add the proper new predecessor edge later.
- FlowEdge* const oldEdge = jumpTab[i];
+ FlowEdge* const oldEdge = jumpTab[i];
+ BasicBlock* const targetBlock = oldEdge->getDestinationBlock();
// Compute the likelihood that this test is successful.
// Divide by number of cases still sharing this edge (reduces likelihood)
@@ -1131,8 +1136,9 @@ GenTree* Lowering::LowerSwitch(GenTree* node)
{
BasicBlock* const newBlock = comp->fgNewBBafter(BBJ_ALWAYS, currentBlock, true);
FlowEdge* const newEdge = comp->fgAddRefPred(newBlock, currentBlock);
- currentBlock = newBlock;
- currentBBRange = &LIR::AsRange(currentBlock);
+ newBlock->inheritWeight(currentBlock);
+ currentBlock = newBlock;
+ currentBBRange = &LIR::AsRange(currentBlock);
afterDefaultCondBlock->SetKindAndTargetEdge(BBJ_ALWAYS, newEdge);
}
@@ -1207,6 +1213,25 @@ GenTree* Lowering::LowerSwitch(GenTree* node)
currentBlock->RemoveFlags(BBF_DONT_REMOVE);
comp->fgRemoveBlock(currentBlock, /* unreachable */ false); // It's an empty block.
}
+
+ // Update flow into switch targets
+ if (afterDefaultCondBlock->hasProfileWeight())
+ {
+ bool profileInconsistent = false;
+ for (unsigned i = 0; i < jumpCnt - 1; i++)
+ {
+ BasicBlock* const targetBlock = jumpTab[i]->getDestinationBlock();
+ targetBlock->setBBProfileWeight(targetBlock->computeIncomingWeight());
+ profileInconsistent |= (targetBlock->NumSucc() > 0);
+ }
+
+ if (profileInconsistent)
+ {
+ JITDUMP("Switch lowering: Flow out of " FMT_BB " needs to be propagated. Data %s inconsistent.\n",
+ afterDefaultCondBlock->bbNum, comp->fgPgoConsistent ? "is now" : "was already");
+ comp->fgPgoConsistent = false;
+ }
+ }
}
else
{
@@ -1260,11 +1285,28 @@ GenTree* Lowering::LowerSwitch(GenTree* node)
JITDUMP("Zero weight switch block " FMT_BB ", distributing likelihoods equally per case\n",
afterDefaultCondBlock->bbNum);
// jumpCnt-1 here because we peeled the default after copying this value.
- weight_t const newLikelihood = 1.0 / (jumpCnt - 1);
+ weight_t const newLikelihood = 1.0 / (jumpCnt - 1);
+ bool profileInconsistent = false;
for (unsigned i = 0; i < successors.numDistinctSuccs; i++)
{
- FlowEdge* const edge = successors.nonDuplicates[i];
+ FlowEdge* const edge = successors.nonDuplicates[i];
+ weight_t const oldEdgeWeight = edge->getLikelyWeight();
edge->setLikelihood(newLikelihood * edge->getDupCount());
+ weight_t const newEdgeWeight = edge->getLikelyWeight();
+
+ if (afterDefaultCondBlock->hasProfileWeight())
+ {
+ BasicBlock* const targetBlock = edge->getDestinationBlock();
+ targetBlock->increaseBBProfileWeight(newEdgeWeight - oldEdgeWeight);
+ profileInconsistent |= (targetBlock->NumSucc() > 0);
+ }
+ }
+
+ if (profileInconsistent)
+ {
+ JITDUMP("Switch lowering: Flow out of " FMT_BB " needs to be propagated. Data %s inconsistent.\n",
+ afterDefaultCondBlock->bbNum, comp->fgPgoConsistent ? "is now" : "was already");
+ comp->fgPgoConsistent = false;
}
}
else
@@ -1447,6 +1489,22 @@ bool Lowering::TryLowerSwitchToBitTest(FlowEdge* jumpTable[],
bbSwitch->SetCond(case1Edge, case0Edge);
+ //
+ // Update profile
+ //
+ if (bbSwitch->hasProfileWeight())
+ {
+ bbCase0->setBBProfileWeight(bbCase0->computeIncomingWeight());
+ bbCase1->setBBProfileWeight(bbCase1->computeIncomingWeight());
+
+ if ((bbCase0->NumSucc() > 0) || (bbCase1->NumSucc() > 0))
+ {
+ JITDUMP("TryLowerSwitchToBitTest: Flow out of " FMT_BB " needs to be propagated. Data %s inconsistent.\n",
+ bbSwitch->bbNum, comp->fgPgoConsistent ? "is now" : "was already");
+ comp->fgPgoConsistent = false;
+ }
+ }
+
var_types bitTableType = (bitCount <= (genTypeSize(TYP_INT) * 8)) ? TYP_INT : TYP_LONG;
GenTree* bitTableIcon = comp->gtNewIconNode(bitTable, bitTableType);
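
For reference, TryLowerSwitchToBitTest applies when every case of a dense switch routes to one of exactly two blocks; the jump table collapses into a bit mask tested with a single bit-test instruction, and the new code above then recomputes both target blocks' profile weights from their incoming edges. A hypothetical C# switch of that shape:

    static class SwitchShape
    {
        // All five cases share one target, the default shares the other, and
        // the case range fits in 32 bits, so a single bit-table test suffices.
        static bool IsVowel(char c)
        {
            switch (c)
            {
                case 'a': case 'e': case 'i': case 'o': case 'u':
                    return true;
                default:
                    return false;
            }
        }
    }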
diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h
index 659870c844cd73..7fa3aca9e98511 100644
--- a/src/coreclr/jit/lower.h
+++ b/src/coreclr/jit/lower.h
@@ -94,6 +94,7 @@ class Lowering final : public Phase
insCflags TruthifyingFlags(GenCondition cond);
void ContainCheckConditionalCompare(GenTreeCCMP* ccmp);
void ContainCheckNeg(GenTreeOp* neg);
+ void ContainCheckNot(GenTreeOp* notOp);
void TryLowerCnsIntCselToCinc(GenTreeOp* select, GenTree* cond);
void TryLowerCselToCSOp(GenTreeOp* select, GenTree* cond);
bool TryLowerAddSubToMulLongOp(GenTreeOp* op, GenTree** next);
diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index 471ec849686e92..852d912a133e78 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -304,7 +304,7 @@ bool Lowering::IsContainableUnaryOrBinaryOp(GenTree* parentNode, GenTree* childN
}
}
- if (childNode->OperIs(GT_LSH, GT_RSH, GT_RSZ) && parentNode->OperIs(GT_AND_NOT))
+ if (childNode->OperIs(GT_LSH, GT_RSH, GT_RSZ) && parentNode->OperIs(GT_NOT, GT_AND_NOT))
{
return true;
}
@@ -3290,6 +3290,31 @@ void Lowering::ContainCheckNeg(GenTreeOp* neg)
}
}
+//------------------------------------------------------------------------
+// ContainCheckNot : determine whether the source of a not should be contained.
+//
+// Arguments:
+// notOp - pointer to the node
+//
+void Lowering::ContainCheckNot(GenTreeOp* notOp)
+{
+ if (notOp->isContained())
+ return;
+
+ if (!varTypeIsIntegral(notOp))
+ return;
+
+ if ((notOp->gtFlags & GTF_SET_FLAGS))
+ return;
+
+ GenTree* childNode = notOp->gtGetOp1();
+ if (comp->opts.OptimizationEnabled() && childNode->OperIs(GT_LSH, GT_RSH, GT_RSZ) &&
+ IsContainableUnaryOrBinaryOp(notOp, childNode))
+ {
+ MakeSrcContained(notOp, childNode);
+ }
+}
+
//----------------------------------------------------------------------------------------------
// TryLowerCselToCSOp: Try converting SELECT/SELECTCC to SELECT_?/SELECT_?CC. Conversion is possible only if
// one of the operands of the select node is one of GT_NEG, GT_NOT or GT_ADD.
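
The new containment exists because ARM64's MVN accepts a shifted register operand, so a bitwise-not of a shift can fold into one instruction once the shift node is contained under GT_NOT. A small illustration (the expected codegen is noted as a comment, not verified output):

    static class NotShift
    {
        // Before: lsl w0, w0, #3 ; mvn w0, w0
        // With the shift contained by GT_NOT: mvn w0, w0, lsl #3
        static int NotOfShift(int x) => ~(x << 3);
    }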
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 31785e2e052e49..a787fece19fb6d 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -13533,7 +13533,7 @@ PhaseStatus Compiler::fgMorphBlocks()
if (!fgProfileWeightsConsistent(incomingWeight, fgEntryBB->bbWeight))
{
JITDUMP("OSR: Original method entry " FMT_BB " has inconsistent weight. Data %s inconsistent.\n",
- fgPgoConsistent ? "is now" : "was already");
+ fgEntryBB->bbNum, fgPgoConsistent ? "is now" : "was already");
fgPgoConsistent = false;
}
}
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index f427e3fc068d07..4642af455804a6 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2240,6 +2240,8 @@ bool Compiler::optInvertWhileLoop(BasicBlock* block)
//
bNewCond->inheritWeight(block);
+ const weight_t totalWeight = bTest->bbWeight;
+
if (haveProfileWeights)
{
bTest->decreaseBBProfileWeight(block->bbWeight);
@@ -2300,6 +2302,15 @@ bool Compiler::optInvertWhileLoop(BasicBlock* block)
}
}
+ const weight_t loopWeight = bTest->bbWeight;
+ const weight_t nonLoopWeight = bNewCond->bbWeight;
+ if (haveProfileWeights && !fgProfileWeightsConsistent(totalWeight, loopWeight + nonLoopWeight))
+ {
+ JITDUMP("Redirecting flow from " FMT_BB " to " FMT_BB " introduced inconsistency. Data %s inconsistent.\n",
+ bTest->bbNum, bNewCond->bbNum, fgPgoConsistent ? "is now" : "was already");
+ fgPgoConsistent = false;
+ }
+
#ifdef DEBUG
if (verbose)
{
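
To make the new check concrete: before inversion, bTest carries all of the flow that the duplicated condition now splits, so bTest's remaining weight plus bNewCond's weight should reproduce the original total. A sketch of that invariant with made-up numbers (fgProfileWeightsConsistent compares with a tolerance rather than exact equality; the tolerance below is illustrative):

    using System;

    static class WeightSplit
    {
        // Hypothetical values: 100 units entered the old test block; 80 stay
        // on the loop path and 20 were redirected to the zero-trip test.
        static bool Consistent(double total = 100.0, double loop = 80.0, double nonLoop = 20.0)
            => Math.Abs(total - (loop + nonLoop)) <= 0.01 * total;
    }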
diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Reflection/Runtime/Dispensers/DispenserThatReusesAsLongAsKeyIsAlive.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Reflection/Runtime/Dispensers/DispenserThatReusesAsLongAsKeyIsAlive.cs
index d0879b3cd0d9c2..18ba9d7880978d 100644
--- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Reflection/Runtime/Dispensers/DispenserThatReusesAsLongAsKeyIsAlive.cs
+++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Reflection/Runtime/Dispensers/DispenserThatReusesAsLongAsKeyIsAlive.cs
@@ -14,23 +14,16 @@ internal sealed class DispenserThatReusesAsLongAsKeyIsAlive<K, V> : Dispenser<K, V>
public DispenserThatReusesAsLongAsKeyIsAlive(Func<K, V> factory)
{
- _createValueCallback = CreateValue;
_conditionalWeakTable = new ConditionalWeakTable<K, V>();
_factory = factory;
}
public sealed override V GetOrAdd(K key)
{
- return _conditionalWeakTable.GetValue(key, _createValueCallback);
- }
-
- private V CreateValue(K key)
- {
- return _factory(key);
+ return _conditionalWeakTable.GetOrAdd(key, _factory);
}
private readonly Func<K, V> _factory;
private readonly ConditionalWeakTable<K, V> _conditionalWeakTable;
- private readonly ConditionalWeakTable<K, V>.CreateValueCallback _createValueCallback;
}
}
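
The rewrite above is the first of several in this diff that migrate from ConditionalWeakTable<K, V>.GetValue, which requires a CreateValueCallback (so callers cached one in a field to avoid repeated delegate allocations), to GetOrAdd, which takes a plain Func<K, V>. A minimal sketch of the resulting shape, assuming the GetOrAdd(K, Func<K, V>) overload used in this diff; the names are illustrative:

    using System;
    using System.Runtime.CompilerServices;

    internal sealed class WeakCache<K, V>
        where K : class
        where V : class
    {
        private readonly ConditionalWeakTable<K, V> _table = new();
        private readonly Func<K, V> _factory;

        public WeakCache(Func<K, V> factory) => _factory = factory;

        // No cached CreateValueCallback field: GetOrAdd accepts the factory
        // directly, and entries live only while their keys stay reachable.
        public V GetOrAdd(K key) => _table.GetOrAdd(key, _factory);
    }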
diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/ComWrappers.NativeAot.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/ComWrappers.NativeAot.cs
index 87eb31d58022fe..38105ec0984fdb 100644
--- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/ComWrappers.NativeAot.cs
+++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/ComWrappers.NativeAot.cs
@@ -701,6 +701,13 @@ public void DisconnectTracker()
}
}
+ // Custom type instead of a value tuple to avoid rooting 'ITuple' and other value tuple stuff
+ private struct GetOrCreateComInterfaceForObjectParameters
+ {
+ public ComWrappers? This;
+ public CreateComInterfaceFlags Flags;
+ }
+
/// <summary>
/// Create a COM representation of the supplied object that can be passed to a non-managed environment.
/// </summary>
@@ -716,18 +723,12 @@ public unsafe IntPtr GetOrCreateComInterfaceForObject(object instance, CreateComInterfaceFlags flags)
{
ArgumentNullException.ThrowIfNull(instance);
- ManagedObjectWrapperHolder? managedObjectWrapper;
- if (_managedObjectWrapperTable.TryGetValue(instance, out managedObjectWrapper))
+ ManagedObjectWrapperHolder managedObjectWrapper = _managedObjectWrapperTable.GetOrAdd(instance, static (c, items) =>
{
- managedObjectWrapper.AddRef();
- return managedObjectWrapper.ComIp;
- }
-
- managedObjectWrapper = _managedObjectWrapperTable.GetValue(instance, (c) =>
- {
- ManagedObjectWrapper* value = CreateManagedObjectWrapper(c, flags);
+ ManagedObjectWrapper* value = items.This!.CreateManagedObjectWrapper(c, items.Flags);
return new ManagedObjectWrapperHolder(value, c);
- });
+ }, new GetOrCreateComInterfaceForObjectParameters { This = this, Flags = flags });
+
managedObjectWrapper.AddRef();
return managedObjectWrapper.ComIp;
}
@@ -1069,15 +1070,11 @@ private void RegisterWrapperForObject(NativeObjectWrapper wrapper, object comPro
Debug.Assert(wrapper.ProxyHandle.Target == comProxy);
Debug.Assert(wrapper.IsUniqueInstance || _rcwCache.FindProxyForComInstance(wrapper.ExternalComObject) == comProxy);
- if (s_nativeObjectWrapperTable.TryGetValue(comProxy, out NativeObjectWrapper? registeredWrapper)
- && registeredWrapper != wrapper)
- {
- Debug.Assert(registeredWrapper.ExternalComObject != wrapper.ExternalComObject);
- wrapper.Release();
- throw new NotSupportedException();
- }
+ // Add the input wrapper bound to the COM proxy, if there isn't one already. If another thread
+ // raced against this one and won, we get the wrapper that thread added instead.
+ NativeObjectWrapper registeredWrapper = s_nativeObjectWrapperTable.GetOrAdd(comProxy, wrapper);
- registeredWrapper = GetValueFromRcwTable(comProxy, wrapper);
+ // If we lost the race, we cannot register the incoming wrapper with the target object
if (registeredWrapper != wrapper)
{
Debug.Assert(registeredWrapper.ExternalComObject != wrapper.ExternalComObject);
@@ -1091,9 +1088,6 @@ private void RegisterWrapperForObject(NativeObjectWrapper wrapper, object comPro
// TrackerObjectManager and we could end up missing a section of the object graph.
// This cache deduplicates, so it is okay that the wrapper will be registered multiple times.
AddWrapperToReferenceTrackerHandleCache(registeredWrapper);
-
- // Separate out into a local function to avoid the closure and delegate allocation unless we need it.
- static NativeObjectWrapper GetValueFromRcwTable(object userObject, NativeObjectWrapper newWrapper) => s_nativeObjectWrapperTable.GetValue(userObject, _ => newWrapper);
}
private static void AddWrapperToReferenceTrackerHandleCache(NativeObjectWrapper wrapper)
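
Two details of the ComWrappers change deserve a note: a static lambda cannot capture locals, so the factory needs an explicit state argument, and that state rides in a hand-written struct because a ValueTuple would root ITuple metadata in a trimmed NativeAOT binary (per the comment above). A sketch of the same pattern against the state-taking GetOrAdd overload used in this diff; the types here are hypothetical:

    using System;
    using System.Runtime.CompilerServices;

    internal struct FactoryArgs
    {
        public int Flags; // everything the factory needs, passed without capture
    }

    internal static class WrapperCache
    {
        private static readonly ConditionalWeakTable<object, string> s_table = new();

        public static string GetOrCreate(object key, int flags)
        {
            // 'static' keeps the lambda capture-free (the compiler caches the
            // delegate once), and FactoryArgs carries the per-call state.
            return s_table.GetOrAdd(
                key,
                static (k, args) => $"{k.GetType().Name}#{args.Flags}",
                new FactoryArgs { Flags = flags });
        }
    }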
diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/ObjectiveCMarshal.NativeAot.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/ObjectiveCMarshal.NativeAot.cs
index ccb969f1728011..621c62d87db7d9 100644
--- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/ObjectiveCMarshal.NativeAot.cs
+++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/ObjectiveCMarshal.NativeAot.cs
@@ -136,7 +136,7 @@ private static IntPtr CreateReferenceTrackingHandleInternal(
throw new InvalidOperationException(SR.InvalidOperation_ObjectiveCTypeNoFinalizer);
}
- var trackerInfo = s_objects.GetValue(obj, static o => new ObjcTrackingInformation());
+ var trackerInfo = s_objects.GetOrAdd(obj, static o => new ObjcTrackingInformation());
trackerInfo.EnsureInitialized(obj);
trackerInfo.GetTaggedMemory(out memInSizeT, out mem);
return RuntimeImports.RhHandleAllocRefCounted(obj);
diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/PInvokeMarshal.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/PInvokeMarshal.cs
index 81c9482a557580..5ae29387480839 100644
--- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/PInvokeMarshal.cs
+++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/PInvokeMarshal.cs
@@ -70,7 +70,7 @@ public static unsafe IntPtr GetFunctionPointerForDelegate(Delegate del)
//
// Marshalling a managed delegate created from managed code into a native function pointer
//
- return GetPInvokeDelegates().GetValue(del, s_AllocateThunk ??= AllocateThunk).Thunk;
+ return GetPInvokeDelegates().GetOrAdd(del, s_AllocateThunk ??= AllocateThunk).Thunk;
}
}
@@ -78,7 +78,7 @@ public static unsafe IntPtr GetFunctionPointerForDelegate(Delegate del)
/// <summary>
/// Used to lookup whether a delegate already has thunk allocated for it
/// </summary>
private static ConditionalWeakTable<Delegate, PInvokeDelegateThunk> s_pInvokeDelegates;
- private static ConditionalWeakTable<Delegate, PInvokeDelegateThunk>.CreateValueCallback s_AllocateThunk;
+ private static Func<Delegate, PInvokeDelegateThunk> s_AllocateThunk;
private static ConditionalWeakTable<Delegate, PInvokeDelegateThunk> GetPInvokeDelegates()
{
diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Monitor.NativeAot.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Monitor.NativeAot.cs
index bed275c1ea995d..4c90bbaa42ad3e 100644
--- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Monitor.NativeAot.cs
+++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Monitor.NativeAot.cs
@@ -25,14 +25,14 @@ public static partial class Monitor
#region Object->Lock/Condition mapping
private static readonly ConditionalWeakTable