diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml index ccfc1e5fb8a742..972d21c3bcedf9 100644 --- a/.github/workflows/libclang-abi-tests.yml +++ b/.github/workflows/libclang-abi-tests.yml @@ -33,7 +33,6 @@ jobs: ABI_HEADERS: ${{ steps.vars.outputs.ABI_HEADERS }} ABI_LIBS: ${{ steps.vars.outputs.ABI_LIBS }} BASELINE_VERSION_MAJOR: ${{ steps.vars.outputs.BASELINE_VERSION_MAJOR }} - BASELINE_VERSION_MINOR: ${{ steps.vars.outputs.BASELINE_VERSION_MINOR }} LLVM_VERSION_MAJOR: ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} LLVM_VERSION_MINOR: ${{ steps.version.outputs.LLVM_VERSION_MINOR }} LLVM_VERSION_PATCH: ${{ steps.version.outputs.LLVM_VERSION_PATCH }} @@ -51,9 +50,9 @@ jobs: id: vars run: | remote_repo='https://github.com/llvm/llvm-project' - if [ ${{ steps.version.outputs.LLVM_VERSION_MINOR }} -ne 0 ] || [ ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then + if [ ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then major_version=$(( ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} - 1)) - baseline_ref="llvmorg-$major_version.0.0" + baseline_ref="llvmorg-$major_version.1.0" # If there is a minor release, we want to use that as the base line. minor_ref=$(git ls-remote --refs -t "$remote_repo" llvmorg-"$major_version".[1-9].[0-9] | tail -n1 | grep -o 'llvmorg-.\+' || true) @@ -75,7 +74,7 @@ jobs: else { echo "BASELINE_VERSION_MAJOR=${{ steps.version.outputs.LLVM_VERSION_MAJOR }}" - echo "BASELINE_REF=llvmorg-${{ steps.version.outputs.LLVM_VERSION_MAJOR }}.0.0" + echo "BASELINE_REF=llvmorg-${{ steps.version.outputs.LLVM_VERSION_MAJOR }}.1.0" echo "ABI_HEADERS=." echo "ABI_LIBS=libclang.so libclang-cpp.so" } >> "$GITHUB_OUTPUT" diff --git a/amd/comgr/test/CMakeLists.txt b/amd/comgr/test/CMakeLists.txt index f0bf636fd3c7ce..39372323ed21e4 100644 --- a/amd/comgr/test/CMakeLists.txt +++ b/amd/comgr/test/CMakeLists.txt @@ -246,7 +246,6 @@ add_comgr_test(lookup_code_object_test c) add_comgr_test(symbolize_test c) add_comgr_test(mangled_names_test c) add_comgr_test(multithread_test cpp) -add_comgr_test(name_expression_map_test c) add_comgr_test(nested_kernel_test c) add_comgr_test(map_elf_virtual_address_test c) add_comgr_test(compile_source_to_executable c) @@ -264,4 +263,5 @@ if (DEFINED HIP_COMPILER AND "${HIP_COMPILER}" STREQUAL "clang") add_comgr_test(unbundle_hip_test c) add_comgr_test(compile_hip_to_relocatable c) add_comgr_test(mangled_names_hip_test c) + add_comgr_test(name_expression_map_test c) endif() diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md new file mode 100644 index 00000000000000..1951ad5a2dc5eb --- /dev/null +++ b/bolt/docs/CommandLineArgumentReference.md @@ -0,0 +1,1213 @@ +# BOLT - a post-link optimizer developed to speed up large applications + +## SYNOPSIS + +`llvm-bolt [-o outputfile] .bolt [-data=perf.fdata] [options]` + +## OPTIONS + +### Generic options + +- `-h` + + Alias for `--help` + +- `--help` + + Display available options (`--help-hidden` for more). + +- `--help-hidden` + + Display all available options. + +- `--help-list` + + Display list of available options (`--help-list-hidden` for more). + +- `--help-list-hidden` + + Display list of all available options. + +- `--print-all-options` + + Print all option values after command line parsing. + +- `--print-options` + + Print non-default options after command line parsing. + +- `--version` + + Display the version of this program. 
+ +### Output options + +- `-o ` + + output file + +- `-w ` + + Save recorded profile to a file + +### BOLT generic options + +- `--align-text=` + + Alignment of .text section + +- `--allow-stripped` + + Allow processing of stripped binaries + +- `--asm-dump[=]` + + Dump function into assembly + +- `-b` + + Alias for -data + +- `--bolt-id=` + + Add any string to tag this execution in the output binary via bolt info section + +- `--break-funcs=` + + List of functions to core dump on (debugging) + +- `--check-encoding` + + Perform verification of LLVM instruction encoding/decoding. Every instruction + in the input is decoded and re-encoded. If the resulting bytes do not match + the input, a warning message is printed. + +- `--cu-processing-batch-size=` + + Specifies the size of batches for processing CUs. Higher number has better + performance, but more memory usage. Default value is 1. + +- `--data=` + + + +- `--debug-skeleton-cu` + + Prints out offsets for abbrev and debug_info of Skeleton CUs that get patched. + +- `--deterministic-debuginfo` + + Disables parallel execution of tasks that may produce nondeterministic debug info + +- `--dot-tooltip-code` + + Add basic block instructions as tool tips on nodes + +- `--dump-cg=` + + Dump callgraph to the given file + +- `--dump-data` + + Dump parsed bolt data for debugging + +- `--dump-dot-all` + + Dump function CFGs to graphviz format after each stage; enable '-print-loops' + for color-coded blocks + +- `--dump-orc` + + Dump raw ORC unwind information (sorted) + +- `--dwarf-output-path=` + + Path to where .dwo files or dwp file will be written out to. + +- `--dwp=` + + Path and name to DWP file. + +- `--dyno-stats` + + Print execution info based on profile + +- `--dyno-stats-all` + + Print dyno stats after each stage + +- `--dyno-stats-scale=` + + Scale to be applied while reporting dyno stats + +- `--enable-bat` + + Write BOLT Address Translation tables + +- `--force-data-relocations` + + Force relocations to data sections to always be processed + +- `--force-patch` + + Force patching of original entry points + +- `--funcs=` + + Limit optimizations to functions from the list + +- `--funcs-file=` + + File with list of functions to optimize + +- `--funcs-file-no-regex=` + + File with list of functions to optimize (non-regex) + +- `--funcs-no-regex=` + + Limit optimizations to functions from the list (non-regex) + +- `--hot-data` + + Hot data symbols support (relocation mode) + +- `--hot-functions-at-end` + + If reorder-functions is used, order functions putting hottest last + +- `--hot-text` + + Generate hot text symbols. Apply this option to a precompiled binary that + manually calls into hugify, such that at runtime hugify call will put hot + code into 2M pages. This requires relocation. + +- `--hot-text-move-sections=` + + List of sections containing functions used for hugifying hot text. BOLT makes + sure these functions are not placed on the same page as the hot text. + (default='.stub,.mover'). 
+ +- `--insert-retpolines` + + Run retpoline insertion pass + +- `--keep-aranges` + + Keep or generate .debug_aranges section if .gdb_index is written + +- `--keep-tmp` + + Preserve intermediate .o file + +- `--lite` + + Skip processing of cold functions + +- `--max-data-relocations=` + + Maximum number of data relocations to process + +- `--max-funcs=` + + Maximum number of functions to process + +- `--no-huge-pages` + + Use regular size pages for code alignment + +- `--no-threads` + + Disable multithreading + +- `--pad-funcs=` + + List of functions to pad with amount of bytes + +- `--profile-format=` + + Format to dump profile output in aggregation mode, default is fdata + - `=fdata`: offset-based plaintext format + - `=yaml`: dense YAML representation + +- `--r11-availability=` + + Determine the availability of r11 before indirect branches + - `=never`: r11 not available + - `=always`: r11 available before calls and jumps + - `=abi`r11 available before calls but not before jumps + +- `--relocs` + + Use relocations in the binary (default=autodetect) + +- `--remove-symtab` + + Remove .symtab section + +- `--reorder-skip-symbols=` + + List of symbol names that cannot be reordered + +- `--reorder-symbols=` + + List of symbol names that can be reordered + +- `--retpoline-lfence` + + Determine if lfence instruction should exist in the retpoline + +- `--skip-funcs=` + + List of functions to skip + +- `--skip-funcs-file=` + + File with list of functions to skip + +- `--strict` + + Trust the input to be from a well-formed source + +- `--tasks-per-thread=` + + Number of tasks to be created per thread + +- `--thread-count=` + + Number of threads + +- `--top-called-limit=` + + Maximum number of functions to print in top called functions section + +- `--trap-avx512` + + In relocation mode trap upon entry to any function that uses AVX-512 instructions + +- `--trap-old-code` + + Insert traps in old function bodies (relocation mode) + +- `--update-debug-sections` + + Update DWARF debug sections of the executable + +- `--use-gnu-stack` + + Use GNU_STACK program header for new segment (workaround for issues with + strip/objcopy) + +- `--use-old-text` + + Re-use space in old .text if possible (relocation mode) + +- `-v ` + + Set verbosity level for diagnostic output + +- `--write-dwp` + + Output a single dwarf package file (dwp) instead of multiple non-relocatable + dwarf object files (dwo). + +### BOLT optimization options + +- `--align-blocks` + + Align basic blocks + +- `--align-blocks-min-size=` + + Minimal size of the basic block that should be aligned + +- `--align-blocks-threshold=` + + Align only blocks with frequency larger than containing function execution + frequency specified in percent. E.g. 1000 means aligning blocks that are 10 + times more frequently executed than the containing function. 
+ +- `--align-functions=` + + Align functions at a given value (relocation mode) + +- `--align-functions-max-bytes=` + + Maximum number of bytes to use to align functions + +- `--assume-abi` + + Assume the ABI is never violated + +- `--block-alignment=` + + Boundary to use for alignment of basic blocks + +- `--bolt-seed=` + + Seed for randomization + +- `--cg-from-perf-data` + + Use perf data directly when constructing the call graph for stale functions + +- `--cg-ignore-recursive-calls` + + Ignore recursive calls when constructing the call graph + +- `--cg-use-split-hot-size` + + Use hot/cold data on basic blocks to determine hot sizes for call graph functions + +- `--cold-threshold=` + + Tenths of percents of main entry frequency to use as a threshold when + evaluating whether a basic block is cold (0 means it is only considered + cold if the block has zero samples). Default: 0 + +- `--elim-link-veneers` + + Run veneer elimination pass + +- `--eliminate-unreachable` + + Eliminate unreachable code + +- `--equalize-bb-counts` + + Use same count for BBs that should have equivalent count (used in non-LBR + and shrink wrapping) + +- `--execution-count-threshold=` + + Perform profiling accuracy-sensitive optimizations only if function execution + count >= the threshold (default: 0) + +- `--fix-block-counts` + + Adjust block counts based on outgoing branch counts + +- `--fix-func-counts` + + Adjust function counts based on basic blocks execution count + +- `--force-inline=` + + List of functions to always consider for inlining + +- `--frame-opt=` + + Optimize stack frame accesses + - `none`: do not perform frame optimization + - `hot`: perform FOP on hot functions + - `all`: perform FOP on all functions + +- `--frame-opt-rm-stores` + + Apply additional analysis to remove stores (experimental) + +- `--function-order=` + + File containing an ordered list of functions to use for function reordering + +- `--generate-function-order=` + + File to dump the ordered list of functions to use for function reordering + +- `--generate-link-sections=` + + Generate a list of function sections in a format suitable for inclusion in a + linker script + +- `--group-stubs` + + Share stubs across functions + +- `--hugify` + + Automatically put hot code on 2MB page(s) (hugify) at runtime. No manual call + to hugify is needed in the binary (which is what --hot-text relies on). 
+ +- `--icf` + + Fold functions with identical code + +- `--icp` + + Alias for --indirect-call-promotion + +- `--icp-calls-remaining-percent-threshold=` + + The percentage threshold against remaining unpromoted indirect call count + for the promotion for calls + +- `--icp-calls-topn` + + Alias for --indirect-call-promotion-calls-topn + +- `--icp-calls-total-percent-threshold=` + + The percentage threshold against total count for the promotion for calls + +- `--icp-eliminate-loads` + + Enable load elimination using memory profiling data when performing ICP + +- `--icp-funcs=` + + List of functions to enable ICP for + +- `--icp-inline` + + Only promote call targets eligible for inlining + +- `--icp-jt-remaining-percent-threshold=` + + The percentage threshold against remaining unpromoted indirect call count for + the promotion for jump tables + +- `--icp-jt-targets` + + Alias for --icp-jump-tables-targets + +- `--icp-jt-topn` + + Alias for --indirect-call-promotion-jump-tables-topn + +- `--icp-jt-total-percent-threshold=` + + The percentage threshold against total count for the promotion for jump tables + +- `--icp-jump-tables-targets` + + For jump tables, optimize indirect jmp targets instead of indices + +- `--icp-mp-threshold` + + Alias for --indirect-call-promotion-mispredict-threshold + +- `--icp-old-code-sequence` + + Use old code sequence for promoted calls + +- `--icp-top-callsites=` + + Optimize hottest calls until at least this percentage of all indirect calls + frequency is covered. 0 = all callsites + +- `--icp-topn` + + Alias for --indirect-call-promotion-topn + +- `--icp-use-mp` + + Alias for --indirect-call-promotion-use-mispredicts + +- `--indirect-call-promotion=` + + Indirect call promotion + - `none`: do not perform indirect call promotion + - `calls`: perform ICP on indirect calls + - `jump-tables`: perform ICP on jump tables + - `all`: perform ICP on calls and jump tables + +- `--indirect-call-promotion-calls-topn=` + + Limit number of targets to consider when doing indirect call promotion on + calls. 0 = no limit + +- `--indirect-call-promotion-jump-tables-topn=` + + Limit number of targets to consider when doing indirect call promotion on + jump tables. 0 = no limit + +- `--indirect-call-promotion-mispredict-threshold=` + + Misprediction threshold for skipping ICP on an indirect call + +- `--indirect-call-promotion-topn=` + + Limit number of targets to consider when doing indirect call promotion. + 0 = no limit + +- `--indirect-call-promotion-use-mispredicts` + + Use misprediction frequency for determining whether or not ICP should be + applied at a callsite. The `-indirect-call-promotion-mispredict-threshold` + value will be used by this heuristic + +- `--infer-fall-throughs` + + Infer execution count for fall-through blocks + +- `--infer-stale-profile` + + Infer counts from stale profile data. 
+ +- `--inline-all` + + Inline all functions + +- `--inline-ap` + + Adjust function profile after inlining + +- `--inline-limit=` + + Maximum number of call sites to inline + +- `--inline-max-iters=` + + Maximum number of inline iterations + +- `--inline-memcpy` + + Inline memcpy using 'rep movsb' instruction (X86-only) + +- `--inline-small-functions` + + Inline functions if increase in size is less than defined by `-inline-small-functions-bytes` + +- `--inline-small-functions-bytes=` + + Max number of bytes for the function to be considered small for inlining purposes + +- `--instrument` + + Instrument code to generate accurate profile data + +- `--iterative-guess` + + In non-LBR mode, guess edge counts using iterative technique + +- `--jt-footprint-optimize-for-icache` + + With jt-footprint-reduction, only process PIC jumptables and turn off other + transformations that increase code size + +- `--jt-footprint-reduction` + + Make jump tables size smaller at the cost of using more instructions at jump + sites + +- `-jump-tables=` + + Jump tables support (default=basic) + - `none`: do not optimize functions with jump tables + - `basic`: optimize functions with jump tables + - `move`: move jump tables to a separate section + - `split`: split jump tables section into hot and cold based on function + execution frequency + - `aggressive`: aggressively split jump tables section based on usage of the + tables + +- `--keep-nops` + + Keep no-op instructions. By default they are removed. + +- `--lite-threshold-count=` + + Similar to '-lite-threshold-pct' but specify threshold using absolute function + call count. I.e. limit processing to functions executed at least the specified + number of times. + +- `--lite-threshold-pct=` + + Threshold (in percent) for selecting functions to process in lite mode. Higher + threshold means fewer functions to process. E.g threshold of 90 means only top + 10 percent of functions with profile will be processed. 
+ +- `--mcf-use-rarcs` + + In MCF, consider the possibility of cancelling flow to balance edges + +- `--memcpy1-spec=` + + List of functions with call sites for which to specialize memcpy() for size 1 + +- `--min-branch-clusters` + + Use a modified clustering algorithm geared towards minimizing branches + +- `--no-inline` + + Disable all inlining (overrides other inlining options) + +- `--no-scan` + + Do not scan cold functions for external references (may result in slower binary) + +- `--peepholes=` + + Enable peephole optimizations + - `none`: disable peepholes + - `double-jumps`: remove double jumps when able + - `tailcall-traps`: insert tail call traps + - `useless-branches`: remove useless conditional branches + - `all`: enable all peephole optimizations + +- `--plt=` + + Optimize PLT calls (requires linking with -znow) + - `none`: do not optimize PLT calls + - `hot`: optimize executed (hot) PLT calls + - `all`: optimize all PLT calls + +- `--preserve-blocks-alignment` + + Try to preserve basic block alignment + +- `--profile-ignore-hash` + + Ignore hash while reading function profile + +- `--profile-use-dfs` + + Use DFS order for YAML profile + +- `--reg-reassign` + + Reassign registers so as to avoid using REX prefixes in hot code + +- `--reorder-blocks=` + + Change layout of basic blocks in a function + - `none`: do not reorder basic blocks + - `reverse`: layout blocks in reverse order + - `normal`: perform optimal layout based on profile + - `branch-predictor`: perform optimal layout prioritizing branch predictions + - `cache`: perform optimal layout prioritizing I-cache behavior + - `cache+`: perform layout optimizing I-cache behavior + - `ext-tsp`: perform layout optimizing I-cache behavior + - `cluster-shuffle`: perform random layout of clusters + +- `--reorder-data=` + + List of sections to reorder + +- `--reorder-data-algo=` + + Algorithm used to reorder data sections + - `count`: sort hot data by read counts + - `funcs`: sort hot data by hot function usage and count + +- `--reorder-data-inplace` + + Reorder data sections in place + +- `--reorder-data-max-bytes=` + + Maximum number of bytes to reorder + +- `--reorder-data-max-symbols=` + + Maximum number of symbols to reorder + +- `--reorder-functions=` + + Reorder and cluster functions (works only with relocations) + - `none`: do not reorder functions + - `exec-count`: order by execution count + - `hfsort`: use hfsort algorithm + - `hfsort+`: use hfsort+ algorithm + - `cdsort`: use cache-directed sort + - `pettis-hansen`: use Pettis-Hansen algorithm + - `random`: reorder functions randomly + - `user`: use function order specified by -function-order + +- `--reorder-functions-use-hot-size` + + Use a function's hot size when doing clustering + +- `--report-bad-layout=` + + Print top functions with suboptimal code layout on input + +- `--report-stale` + + Print the list of functions with stale profile + +- `--runtime-hugify-lib=` + + Specify file name of the runtime hugify library + +- `--runtime-instrumentation-lib=` + + Specify file name of the runtime instrumentation library + +- `--sctc-mode=` + + Mode for simplify conditional tail calls + - `always`: always perform sctc + - `preserve`: only perform sctc when branch direction is preserved + - `heuristic`: use branch prediction data to control sctc + +- `--sequential-disassembly` + + Performs disassembly sequentially + +- `--shrink-wrapping-threshold=` + + Percentage of prologue execution count to use as threshold when evaluating + whether a block is cold enough to be 
profitable to move eligible spills there + +- `--simplify-conditional-tail-calls` + + Simplify conditional tail calls by removing unnecessary jumps + +- `--simplify-rodata-loads` + + Simplify loads from read-only sections by replacing the memory operand with + the constant found in the corresponding section + +- `--split-align-threshold=` + + When deciding to split a function, apply this alignment while doing the size + comparison (see -split-threshold). Default value: 2. + +- `--split-all-cold` + + Outline as many cold basic blocks as possible + +- `--split-eh` + + Split C++ exception handling code + +- `--split-functions` + + Split functions into fragments + +- `--split-strategy=` + + Strategy used to partition blocks into fragments + + - `profile2`: split each function into a hot and cold fragment using + profiling information + - `cdsplit`: split each function into a hot, warm, and cold fragment using + profiling information + - `random2`: split each function into a hot and cold fragment at a randomly + chosen split point (ignoring any available profiling information) + - `randomN`: split each function into N fragments at randomly chosen split + points (ignoring any available profiling information) + - `all`: split all basic blocks of each function into fragments such that + each fragment contains exactly a single basic block + +- `--split-threshold=` + + Split function only if its main size is reduced by more than given amount of + bytes. Default value: 0, i.e. split iff the size is reduced. Note that on + some architectures the size can increase after splitting. + +- `--stale-matching-max-func-size=` + + The maximum size of a function to consider for inference. + +- `--stale-threshold=` + + Maximum percentage of stale functions to tolerate (default: 100) + +- `--stoke` + + Turn on the stoke analysis + +- `--strip-rep-ret` + + Strip 'repz' prefix from 'repz retq' sequence (on by default) + +- `--tail-duplication=` + + Duplicate unconditional branches that cross a cache line + + - `none` do not apply + - `aggressive` aggressive strategy + - `moderate` moderate strategy + - `cache` cache-aware duplication strategy + +- `--tsp-threshold=` + + Maximum number of hot basic blocks in a function for which to use a precise TSP solution while re-ordering basic blocks + +- `--use-aggr-reg-reassign` + + Use register liveness analysis to try to find more opportunities for -reg-reassign optimization + +- `--use-compact-aligner` + + Use compact approach for aligning functions + +- `--use-edge-counts` + + Use edge count data when doing clustering + +- `--verify-cfg` + + Verify the CFG after every pass + +- `--x86-align-branch-boundary-hot-only` + + Only apply branch boundary alignment in hot code + +- `--x86-strip-redundant-address-size` + + Remove redundant Address-Size override prefix + +### BOLT options in relocation mode + +- `-align-macro-fusion=` + + Fix instruction alignment for macro-fusion (x86 relocation mode) + + - `none`: do not insert alignment no-ops for macro-fusion + - `hot`: only insert alignment no-ops on hot execution paths (default) + - `all`: always align instructions to allow macro-fusion + +### BOLT instrumentation options + +`llvm-bolt -instrument [-o outputfile] ` + +- `--conservative-instrumentation` + + Disable instrumentation optimizations that sacrifice profile accuracy (for + debugging, default: false) + +- `--instrument-calls` + + Record profile for inter-function control flow activity (default: true) + +- `--instrument-hot-only` + + Only insert instrumentation on hot 
functions (needs profile, default: false) + +- `--instrumentation-binpath=` + + Path to instrumented binary in case if /proc/self/map_files is not accessible + due to access restriction issues + +- `--instrumentation-file=` + + File name where instrumented profile will be saved (default: /tmp/prof.fdata) + +- `--instrumentation-file-append-pid` + + Append PID to saved profile file name (default: false) + +- `--instrumentation-no-counters-clear` + + Don't clear counters across dumps (use with `instrumentation-sleep-time` option) + +- `--instrumentation-sleep-time=` + + Interval between profile writes (default: 0 = write only at program end). + This is useful for service workloads when you want to dump profile every X + minutes or if you are killing the program and the profile is not being + dumped at the end. + +- `--instrumentation-wait-forks` + + Wait until all forks of instrumented process will finish (use with + `instrumentation-sleep-time` option) + +### Data aggregation options (perf2bolt) + +`perf2bolt -p perf.data [-o outputfile] perf.fdata ` + +- `--autofdo` + + Generate autofdo textual data instead of bolt data + +- `--filter-mem-profile` + + If processing a memory profile, filter out stack or heap accesses that won't + be useful for BOLT to reduce profile file size + +- `--ignore-build-id` + + Continue even if build-ids in input binary and perf.data mismatch + +- `--ignore-interrupt-lbr` + + Ignore kernel interrupt LBR that happens asynchronously + +- `--itrace=` + + Generate LBR info with perf itrace argument + +- `--nl` + + Aggregate basic samples (without LBR info) + +- `--pa` + + Skip perf and read data from a pre-aggregated file format + +- `--perfdata=` + + Data file + +- `--pid=` + + Only use samples from process with specified PID + +- `--time-aggr` + + Time BOLT aggregator + +- `--use-event-pc` + + Use event PC in combination with LBR sampling + +### BOLT printing options + +#### Generic options + +- `--print-aliases` + + Print aliases when printing objects + +- `--print-all` + + Print functions after each stage + +- `--print-cfg` + + Print functions after CFG construction + +- `--print-debug-info` + + Print debug info when printing functions + +- `--print-disasm` + + Print function after disassembly + +- `--print-dyno-opcode-stats=` + + Print per instruction opcode dyno stats and the functionnames:BB offsets of + the nth highest execution counts + +- `--print-dyno-stats-only` + + While printing functions output dyno-stats and skip instructions + +- `--print-exceptions` + + Print exception handling data + +- `--print-globals` + + Print global symbols after disassembly + +- `--print-jump-tables` + + Print jump tables + +- `--print-loops` + + Print loop related information + +- `--print-mem-data` + + Print memory data annotations when printing functions + +- `--print-normalized` + + Print functions after CFG is normalized + +- `--print-only=` + + List of functions to print + +- `--print-orc` + + Print ORC unwind information for instructions + +- `--print-profile` + + Print functions after attaching profile + +- `--print-profile-stats` + + Print profile quality/bias analysis + +- `--print-pseudo-probes=` + + Print pseudo probe info + - `=decode`: decode probes section from binary + - `=address_conversion`: update address2ProbesMap with output block address + - `=encoded_probes`: display the encoded probes in binary section + - `=all`: enable all debugging printout + +- `--print-relocations` + + Print relocations when printing functions/objects + +- `--print-reordered-data` 
+ + Print section contents after reordering + +- `--print-retpoline-insertion` + + Print functions after retpoline insertion pass + +- `--print-sdt` + + Print all SDT markers + +- `--print-sections` + + Print all registered sections + +- `--print-unknown` + + Print names of functions with unknown control flow + +- `--time-opts` + + Print time spent in each optimization + +#### Optimization options + +- `--print-after-branch-fixup` + + Print function after fixing local branches + +- `--print-after-jt-footprint-reduction` + + Print function after jt-footprint-reduction pass + +- `--print-after-lowering` + + Print function after instruction lowering + +- `--print-cache-metrics` + + Calculate and print various metrics for instruction cache + +- `--print-clusters` + + Print clusters + +- `--print-finalized` + + Print function after CFG is finalized + +- `--print-fix-relaxations` + + Print functions after fix relaxations pass + +- `--print-fix-riscv-calls` + + Print functions after fix RISCV calls pass + +- `--print-fop` + + Print functions after frame optimizer pass + +- `--print-function-statistics=` + + Print statistics about basic block ordering + +- `--print-icf` + + Print functions after ICF optimization + +- `--print-icp` + + Print functions after indirect call promotion + +- `--print-inline` + + Print functions after inlining optimization + +- `--print-longjmp` + + Print functions after longjmp pass + +- `--print-optimize-bodyless` + + Print functions after bodyless optimization + +- `--print-output-address-range` + + Print output address range for each basic block in the function + whenBinaryFunction::print is called + +- `--print-peepholes` + + Print functions after peephole optimization + +- `--print-plt` + + Print functions after PLT optimization + +- `--print-regreassign` + + Print functions after regreassign pass + +- `--print-reordered` + + Print functions after layout optimization + +- `--print-reordered-functions` + + Print functions after clustering + +- `--print-sctc` + + Print functions after conditional tail call simplification + +- `--print-simplify-rodata-loads` + + Print functions after simplification of RO data loads + +- `--print-sorted-by=` + + Print functions sorted by order of dyno stats + - `executed-forward-branches`: executed forward branches + - `taken-forward-branches`: taken forward branches + - `executed-backward-branches`: executed backward branches + - `taken-backward-branches`: taken backward branches + - `executed-unconditional-branches`: executed unconditional branches + - `all-function-calls`: all function calls + - `indirect-calls`: indirect calls + - `PLT-calls`: PLT calls + - `executed-instructions`: executed instructions + - `executed-load-instructions`: executed load instructions + - `executed-store-instructions`: executed store instructions + - `taken-jump-table-branches`: taken jump table branches + - `taken-unknown-indirect-branches`: taken unknown indirect branches + - `total-branches`: total branches + - `taken-branches`: taken branches + - `non-taken-conditional-branches`: non-taken conditional branches + - `taken-conditional-branches`: taken conditional branches + - `all-conditional-branches`: all conditional branches + - `linker-inserted-veneer-calls`: linker-inserted veneer calls + - `all`: sorted by all names + +- `--print-sorted-by-order=` + + Use ascending or descending order when printing functions ordered by dyno stats + +- `--print-split` + + Print functions after code splitting + +- `--print-stoke` + + Print functions after stoke 
analysis + +- `--print-uce` + + Print functions after unreachable code elimination + +- `--print-veneer-elimination` + + Print functions after veneer elimination pass + +- `--time-build` + + Print time spent constructing binary functions + +- `--time-rewrite` + + Print time spent in rewriting passes diff --git a/bolt/lib/Rewrite/CMakeLists.txt b/bolt/lib/Rewrite/CMakeLists.txt index 6890f52e2b28bb..578f1763bfe4ec 100644 --- a/bolt/lib/Rewrite/CMakeLists.txt +++ b/bolt/lib/Rewrite/CMakeLists.txt @@ -1,4 +1,5 @@ set(LLVM_LINK_COMPONENTS + Core DebugInfoDWARF DWP JITLink diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp index ca1ae551cc632a..2fca7ae2e7eee8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp @@ -171,8 +171,7 @@ void SuspiciousEnumUsageCheck::check(const MatchFinder::MatchResult &Result) { // Skip when one of the parameters is an empty enum. The // hasDisjointValueRange function could not decide the values properly in // case of an empty enum. - if (EnumDec->enumerator_begin() == EnumDec->enumerator_end() || - OtherEnumDec->enumerator_begin() == OtherEnumDec->enumerator_end()) + if (EnumDec->enumerators().empty() || OtherEnumDec->enumerators().empty()) return; if (!hasDisjointValueRange(EnumDec, OtherEnumDec)) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp index 00dfa17a1ccf61..5dee7f91a93410 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp @@ -67,8 +67,7 @@ static std::string createReplacementText(const LambdaExpr *Lambda) { AppendName("this"); } } - if (!Replacement.empty() && - Lambda->explicit_capture_begin() != Lambda->explicit_capture_end()) { + if (!Replacement.empty() && !Lambda->explicit_captures().empty()) { // Add back separator if we are adding explicit capture variables. 
Stream << ", "; } diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp index d2117c67a76d0b..ed76ac665049d1 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp @@ -25,7 +25,9 @@ SpecialMemberFunctionsCheck::SpecialMemberFunctionsCheck( "AllowMissingMoveFunctions", false)), AllowSoleDefaultDtor(Options.get("AllowSoleDefaultDtor", false)), AllowMissingMoveFunctionsWhenCopyIsDeleted( - Options.get("AllowMissingMoveFunctionsWhenCopyIsDeleted", false)) {} + Options.get("AllowMissingMoveFunctionsWhenCopyIsDeleted", false)), + AllowImplicitlyDeletedCopyOrMove( + Options.get("AllowImplicitlyDeletedCopyOrMove", false)) {} void SpecialMemberFunctionsCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { @@ -33,17 +35,34 @@ void SpecialMemberFunctionsCheck::storeOptions( Options.store(Opts, "AllowSoleDefaultDtor", AllowSoleDefaultDtor); Options.store(Opts, "AllowMissingMoveFunctionsWhenCopyIsDeleted", AllowMissingMoveFunctionsWhenCopyIsDeleted); + Options.store(Opts, "AllowImplicitlyDeletedCopyOrMove", + AllowImplicitlyDeletedCopyOrMove); +} + +std::optional +SpecialMemberFunctionsCheck::getCheckTraversalKind() const { + return AllowImplicitlyDeletedCopyOrMove ? TK_AsIs + : TK_IgnoreUnlessSpelledInSource; } void SpecialMemberFunctionsCheck::registerMatchers(MatchFinder *Finder) { + auto IsNotImplicitOrDeleted = anyOf(unless(isImplicit()), isDeleted()); + Finder->addMatcher( cxxRecordDecl( - eachOf(has(cxxDestructorDecl().bind("dtor")), - has(cxxConstructorDecl(isCopyConstructor()).bind("copy-ctor")), - has(cxxMethodDecl(isCopyAssignmentOperator()) + unless(isImplicit()), + eachOf(has(cxxDestructorDecl(unless(isImplicit())).bind("dtor")), + has(cxxConstructorDecl(isCopyConstructor(), + IsNotImplicitOrDeleted) + .bind("copy-ctor")), + has(cxxMethodDecl(isCopyAssignmentOperator(), + IsNotImplicitOrDeleted) .bind("copy-assign")), - has(cxxConstructorDecl(isMoveConstructor()).bind("move-ctor")), - has(cxxMethodDecl(isMoveAssignmentOperator()) + has(cxxConstructorDecl(isMoveConstructor(), + IsNotImplicitOrDeleted) + .bind("move-ctor")), + has(cxxMethodDecl(isMoveAssignmentOperator(), + IsNotImplicitOrDeleted) .bind("move-assign")))) .bind("class-def"), this); @@ -127,7 +146,8 @@ void SpecialMemberFunctionsCheck::check( for (const auto &KV : Matchers) if (const auto *MethodDecl = Result.Nodes.getNodeAs(KV.first)) { - StoreMember({KV.second, MethodDecl->isDeleted()}); + StoreMember( + {KV.second, MethodDecl->isDeleted(), MethodDecl->isImplicit()}); } } @@ -144,7 +164,13 @@ void SpecialMemberFunctionsCheck::checkForMissingMembers( auto HasMember = [&](SpecialMemberFunctionKind Kind) { return llvm::any_of(DefinedMembers, [Kind](const auto &Data) { - return Data.FunctionKind == Kind; + return Data.FunctionKind == Kind && !Data.IsImplicit; + }); + }; + + auto HasImplicitDeletedMember = [&](SpecialMemberFunctionKind Kind) { + return llvm::any_of(DefinedMembers, [Kind](const auto &Data) { + return Data.FunctionKind == Kind && Data.IsImplicit && Data.IsDeleted; }); }; @@ -154,9 +180,17 @@ void SpecialMemberFunctionsCheck::checkForMissingMembers( }); }; - auto RequireMember = [&](SpecialMemberFunctionKind Kind) { - if (!HasMember(Kind)) - MissingMembers.push_back(Kind); + auto RequireMembers = [&](SpecialMemberFunctionKind Kind1, + SpecialMemberFunctionKind Kind2) { + if 
(AllowImplicitlyDeletedCopyOrMove && HasImplicitDeletedMember(Kind1) && + HasImplicitDeletedMember(Kind2)) + return; + + if (!HasMember(Kind1)) + MissingMembers.push_back(Kind1); + + if (!HasMember(Kind2)) + MissingMembers.push_back(Kind2); }; bool RequireThree = @@ -180,8 +214,8 @@ void SpecialMemberFunctionsCheck::checkForMissingMembers( !HasMember(SpecialMemberFunctionKind::NonDefaultDestructor)) MissingMembers.push_back(SpecialMemberFunctionKind::Destructor); - RequireMember(SpecialMemberFunctionKind::CopyConstructor); - RequireMember(SpecialMemberFunctionKind::CopyAssignment); + RequireMembers(SpecialMemberFunctionKind::CopyConstructor, + SpecialMemberFunctionKind::CopyAssignment); } if (RequireFive && @@ -189,14 +223,16 @@ void SpecialMemberFunctionsCheck::checkForMissingMembers( (IsDeleted(SpecialMemberFunctionKind::CopyConstructor) && IsDeleted(SpecialMemberFunctionKind::CopyAssignment)))) { assert(RequireThree); - RequireMember(SpecialMemberFunctionKind::MoveConstructor); - RequireMember(SpecialMemberFunctionKind::MoveAssignment); + RequireMembers(SpecialMemberFunctionKind::MoveConstructor, + SpecialMemberFunctionKind::MoveAssignment); } if (!MissingMembers.empty()) { llvm::SmallVector DefinedMemberKinds; - llvm::transform(DefinedMembers, std::back_inserter(DefinedMemberKinds), - [](const auto &Data) { return Data.FunctionKind; }); + for (const auto &Data : DefinedMembers) { + if (!Data.IsImplicit) + DefinedMemberKinds.push_back(Data.FunctionKind); + } diag(ID.first, "class '%0' defines %1 but does not define %2") << ID.second << cppcoreguidelines::join(DefinedMemberKinds, " and ") << cppcoreguidelines::join(MissingMembers, " or "); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h index 6042f0fd6cb054..9ebc03ed2fa139 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h @@ -30,9 +30,8 @@ class SpecialMemberFunctionsCheck : public ClangTidyCheck { void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; void onEndOfTranslationUnit() override; - std::optional getCheckTraversalKind() const override { - return TK_IgnoreUnlessSpelledInSource; - } + std::optional getCheckTraversalKind() const override; + enum class SpecialMemberFunctionKind : uint8_t { Destructor, DefaultDestructor, @@ -46,6 +45,7 @@ class SpecialMemberFunctionsCheck : public ClangTidyCheck { struct SpecialMemberFunctionData { SpecialMemberFunctionKind FunctionKind; bool IsDeleted; + bool IsImplicit = false; bool operator==(const SpecialMemberFunctionData &Other) const { return (Other.FunctionKind == FunctionKind) && @@ -67,6 +67,7 @@ class SpecialMemberFunctionsCheck : public ClangTidyCheck { const bool AllowMissingMoveFunctions; const bool AllowSoleDefaultDtor; const bool AllowMissingMoveFunctionsWhenCopyIsDeleted; + const bool AllowImplicitlyDeletedCopyOrMove; ClassDefiningSpecialMembersMap ClassWithSpecialMembers; }; diff --git a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp index 3f1d2f9f580991..c2d9286312dc4a 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp @@ -192,9 +192,7 @@ void UnusedParametersCheck::check(const 
MatchFinder::MatchResult &Result) { // In non-strict mode ignore function definitions with empty bodies // (constructor initializer counts for non-empty body). - if (StrictMode || - (Function->getBody()->child_begin() != - Function->getBody()->child_end()) || + if (StrictMode || !Function->getBody()->children().empty() || (isa(Function) && cast(Function)->getNumCtorInitializers() > 0)) warnOnUnusedParameter(Result, Function, I); diff --git a/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp index 45f7700463d570..418699ffbc4d1a 100644 --- a/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp @@ -129,17 +129,17 @@ generateReplacements(const MatchFinder::MatchResult &Match, continue; } + // if the nested call is not the same as the top call + if (InnerCall->getDirectCallee()->getQualifiedNameAsString() != + TopCall->getDirectCallee()->getQualifiedNameAsString()) + continue; + const FindArgsResult InnerResult = findArgs(InnerCall); // if the nested call doesn't have arguments skip it if (!InnerResult.First || !InnerResult.Last) continue; - // if the nested call is not the same as the top call - if (InnerCall->getDirectCallee()->getQualifiedNameAsString() != - TopCall->getDirectCallee()->getQualifiedNameAsString()) - continue; - // if the nested call doesn't have the same compare function if ((Result.Compare || InnerResult.Compare) && !utils::areStatementsIdentical(Result.Compare, InnerResult.Compare, diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp index 6d7d1d6b87c60a..7a021fe14436a1 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp @@ -41,6 +41,8 @@ AST_MATCHER(FunctionDecl, hasOtherDeclarations) { void UseConstraintsCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher( functionTemplateDecl( + // Skip external libraries included as system headers + unless(isExpansionInSystemHeader()), has(functionDecl(unless(hasOtherDeclarations()), isDefinition(), hasReturnTypeLoc(typeLoc().bind("return"))) .bind("function"))) @@ -57,6 +59,8 @@ matchEnableIfSpecializationImplTypename(TypeLoc TheType) { return std::nullopt; } TheType = Dep.getQualifierLoc().getTypeLoc(); + if (TheType.isNull()) + return std::nullopt; } if (const auto SpecializationLoc = @@ -254,7 +258,7 @@ findInsertionForConstraint(const FunctionDecl *Function, ASTContext &Context) { return utils::lexer::findPreviousTokenKind(Init->getSourceLocation(), SM, LangOpts, tok::colon); } - if (Constructor->init_begin() != Constructor->init_end()) + if (!Constructor->inits().empty()) return std::nullopt; } if (Function->isDeleted()) { diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 19f8307412956b..71734617bf7aa8 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -262,6 +262,12 @@ Changes in existing checks `. Fixed incorrect hints when using list-initialization. +- Improved :doc:`cppcoreguidelines-special-member-functions + ` check with a + new option `AllowImplicitlyDeletedCopyOrMove`, which removes the requirement + for explicit copy or move special member functions when they are already + implicitly deleted. 
+ - Improved :doc:`google-build-namespaces ` check by replacing the local option `HeaderFileExtensions` by the global option of the same name. @@ -322,6 +328,10 @@ Changes in existing checks don't remove parentheses used in ``sizeof`` calls when they have array index accesses as arguments. +- Improved :doc:`modernize-use-constraints + ` check by fixing a crash that + occurred in some scenarios and excluding system headers from analysis. + - Improved :doc:`modernize-use-nullptr ` check to include support for C23, which also has introduced the ``nullptr`` keyword. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/special-member-functions.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/special-member-functions.rst index 176956d6cb2bd7..20f898fdab9305 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/special-member-functions.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/special-member-functions.rst @@ -45,9 +45,10 @@ Options .. option:: AllowMissingMoveFunctions - When set to `true` (default is `false`), this check doesn't flag classes which define no move - operations at all. It still flags classes which define only one of either - move constructor or move assignment operator. With this option enabled, the following class won't be flagged: + When set to `true` (default is `false`), this check doesn't flag classes + which define no move operations at all. It still flags classes which define + only one of either move constructor or move assignment operator. With this + option enabled, the following class won't be flagged: .. code-block:: c++ @@ -59,10 +60,11 @@ Options .. option:: AllowMissingMoveFunctionsWhenCopyIsDeleted - When set to `true` (default is `false`), this check doesn't flag classes which define deleted copy - operations but don't define move operations. This flag is related to Google C++ Style Guide - https://google.github.io/styleguide/cppguide.html#Copyable_Movable_Types. With this option enabled, the - following class won't be flagged: + When set to `true` (default is `false`), this check doesn't flag classes + which define deleted copy operations but don't define move operations. This + flag is related to Google C++ Style Guide `Copyable and Movable Types + `_. + With this option enabled, the following class won't be flagged: .. code-block:: c++ @@ -71,3 +73,15 @@ Options A& operator=(const A&) = delete; ~A(); }; + +.. option:: AllowImplicitlyDeletedCopyOrMove + + When set to `true` (default is `false`), this check doesn't flag classes + which implicitly delete copy or move operations. + With this option enabled, the following class won't be flagged: + + .. code-block:: c++ + + struct A : boost::noncopyable { + ~A() { std::cout << "dtor\n"; } + }; diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-constraints.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-constraints.rst index be62dd5823d552..a8b31b80e580b0 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-constraints.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-constraints.rst @@ -68,3 +68,7 @@ The tool will replace the above code with, // The tool will not emit a diagnostic or attempt to replace the code. template = 0> struct my_class {}; + +.. note:: + + System headers are not analyzed by this check. 
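For readers of the new `AllowImplicitlyDeletedCopyOrMove` option documented above, here is a minimal self-contained sketch of the behavior it enables. It is not part of the patch: `NonCopyableBase` stands in for `boost::noncopyable` from the documentation, and it assumes the check runs with the option set to `true` (as the updated test's RUN line does).

```cpp
// Sketch only: assumes
//   cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true
// NonCopyableBase is a stand-in for boost::noncopyable from the docs above.
struct NonCopyableBase {
  NonCopyableBase() = default;
  NonCopyableBase(const NonCopyableBase &) = delete;
  NonCopyableBase &operator=(const NonCopyableBase &) = delete;
};

// Copy construction and copy assignment are implicitly deleted through the
// base, so with the new option the check no longer asks this class to spell
// out the remaining special member functions.
struct Holder : NonCopyableBase {
  ~Holder() {}
};
```

This mirrors the documented example; classes that explicitly delete only part of the copy interface, like `NonCopyable` in the updated test below, are still diagnosed.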
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/special-member-functions-relaxed.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/special-member-functions-relaxed.cpp index 0c17f57968a990..26142ccc835f29 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/special-member-functions-relaxed.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/special-member-functions-relaxed.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-special-member-functions %t -- -config="{CheckOptions: {cppcoreguidelines-special-member-functions.AllowMissingMoveFunctions: true, cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true}}" -- +// RUN: %check_clang_tidy %s cppcoreguidelines-special-member-functions %t -- -config="{CheckOptions: {cppcoreguidelines-special-member-functions.AllowMissingMoveFunctions: true, cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true, cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true}}" -- // Don't warn on destructors without definitions, they might be defaulted in another TU. class DeclaresDestructor { @@ -34,12 +34,13 @@ class DefinesCopyAssignment { class DefinesMoveConstructor { DefinesMoveConstructor(DefinesMoveConstructor &&); }; -// CHECK-MESSAGES: [[@LINE-3]]:7: warning: class 'DefinesMoveConstructor' defines a move constructor but does not define a destructor, a copy constructor, a copy assignment operator or a move assignment operator [cppcoreguidelines-special-member-functions] +// CHECK-MESSAGES: [[@LINE-3]]:7: warning: class 'DefinesMoveConstructor' defines a move constructor but does not define a destructor or a move assignment operator [cppcoreguidelines-special-member-functions] class DefinesMoveAssignment { DefinesMoveAssignment &operator=(DefinesMoveAssignment &&); }; -// CHECK-MESSAGES: [[@LINE-3]]:7: warning: class 'DefinesMoveAssignment' defines a move assignment operator but does not define a destructor, a copy constructor, a copy assignment operator or a move constructor [cppcoreguidelines-special-member-functions] +// CHECK-MESSAGES: [[@LINE-3]]:7: warning: class 'DefinesMoveAssignment' defines a move assignment operator but does not define a destructor or a move constructor [cppcoreguidelines-special-member-functions] + class DefinesNothing { }; @@ -81,3 +82,22 @@ struct TemplateClass { // This should not cause problems. 
TemplateClass InstantiationWithInt; TemplateClass InstantiationWithDouble; + +struct NoCopy +{ + NoCopy() = default; + ~NoCopy() = default; + + NoCopy(const NoCopy&) = delete; + NoCopy(NoCopy&&) = delete; + + NoCopy& operator=(const NoCopy&) = delete; + NoCopy& operator=(NoCopy&&) = delete; +}; + +// CHECK-MESSAGES: [[@LINE+1]]:8: warning: class 'NonCopyable' defines a copy constructor but does not define a destructor or a copy assignment operator [cppcoreguidelines-special-member-functions] +struct NonCopyable : NoCopy +{ + NonCopyable() = default; + NonCopyable(const NonCopyable&) = delete; +}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp index 51ab9bda975f10..1f2dad2b933ca7 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp @@ -300,6 +300,27 @@ B maxTT2 = std::max(B(), std::max(B(), B())); B maxTT3 = std::max(B(), std::max(B(), B()), [](const B &lhs, const B &rhs) { return lhs.a[0] < rhs.a[0]; }); // CHECK-FIXES: B maxTT3 = std::max(B(), std::max(B(), B()), [](const B &lhs, const B &rhs) { return lhs.a[0] < rhs.a[0]; }); +struct GH91982 { + int fun0Args(); + int fun1Arg(int a); + int fun2Args(int a, int b); + int fun3Args(int a, int b, int c); + int fun4Args(int a, int b, int c, int d); + + int foo() { + return std::max( + fun0Args(), + std::max(fun1Arg(0), + std::max(fun2Args(0, 1), + std::max(fun3Args(0, 1, 2), fun4Args(0, 1, 2, 3))))); +// CHECK-MESSAGES: :[[@LINE-5]]:12: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list] +// CHECK-FIXES: return std::max( +// CHECK-FIXES-NEXT: {fun0Args(), +// CHECK-FIXES-NEXT: fun1Arg(0), +// CHECK-FIXES-NEXT: fun2Args(0, 1), +// CHECK-FIXES-NEXT: fun3Args(0, 1, 2), fun4Args(0, 1, 2, 3)}); + } +}; } // namespace diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-constraints.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-constraints.cpp index 3ec44be8a1c8c3..3bcd5cd74024ea 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-constraints.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-constraints.cpp @@ -724,3 +724,35 @@ void not_last_param() { } } // namespace enable_if_trailing_type_parameter + + +// Issue fixes: + +namespace PR91872 { + +enum expression_template_option { value1, value2 }; + +template struct number_category { + static const int value = 0; +}; + +constexpr int number_kind_complex = 1; + +template +struct number { + using type = T; +}; + +template struct component_type { + using type = T; +}; + +template +inline typename std::enable_if< + number_category::value == number_kind_complex, + component_type>>::type::type +abs(const number &v) { + return {}; +} + +} diff --git a/clang/include/clang/AST/VTTBuilder.h b/clang/include/clang/AST/VTTBuilder.h index 4acbc1f9e96b28..3c19e61a8701ca 100644 --- a/clang/include/clang/AST/VTTBuilder.h +++ b/clang/include/clang/AST/VTTBuilder.h @@ -92,7 +92,7 @@ class VTTBuilder { using AddressPointsMapTy = llvm::DenseMap; /// The sub-VTT indices for the bases of the most derived class. - llvm::DenseMap SubVTTIndicies; + llvm::DenseMap SubVTTIndices; /// The secondary virtual pointer indices of all subobjects of /// the most derived class. 
@@ -148,8 +148,8 @@ class VTTBuilder { } /// Returns a reference to the sub-VTT indices. - const llvm::DenseMap &getSubVTTIndicies() const { - return SubVTTIndicies; + const llvm::DenseMap &getSubVTTIndices() const { + return SubVTTIndices; } /// Returns a reference to the secondary virtual pointer indices. diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 1da74ac7c8bd1d..205e53f02b1a02 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -2498,6 +2498,39 @@ bool ByteCodeExprGen::VisitAddrLabelExpr(const AddrLabelExpr *E) { return this->emitGetLocal(PT_Ptr, Offset, E); } +template +bool ByteCodeExprGen::VisitConvertVectorExpr( + const ConvertVectorExpr *E) { + assert(Initializing); + const auto *VT = E->getType()->castAs(); + QualType ElemType = VT->getElementType(); + PrimType ElemT = classifyPrim(ElemType); + const Expr *Src = E->getSrcExpr(); + PrimType SrcElemT = + classifyPrim(Src->getType()->castAs()->getElementType()); + + unsigned SrcOffset = this->allocateLocalPrimitive(Src, PT_Ptr, true, false); + if (!this->visit(Src)) + return false; + if (!this->emitSetLocal(PT_Ptr, SrcOffset, E)) + return false; + + for (unsigned I = 0; I != VT->getNumElements(); ++I) { + if (!this->emitGetLocal(PT_Ptr, SrcOffset, E)) + return false; + if (!this->emitArrayElemPop(SrcElemT, I, E)) + return false; + if (SrcElemT != ElemT) { + if (!this->emitPrimCast(SrcElemT, ElemT, ElemType, E)) + return false; + } + if (!this->emitInitElem(ElemT, I, E)) + return false; + } + + return true; +} + template bool ByteCodeExprGen::discard(const Expr *E) { OptionScope Scope(this, /*NewDiscardResult=*/true, /*NewInitializing=*/false); @@ -3685,12 +3718,22 @@ bool ByteCodeExprGen::emitPrimCast(PrimType FromT, PrimType ToT, return this->emitCastFP(ToSem, getRoundingMode(E), E); } + if (ToT == PT_IntAP) + return this->emitCastFloatingIntegralAP(Ctx.getBitWidth(ToQT), E); + if (ToT == PT_IntAPS) + return this->emitCastFloatingIntegralAPS(Ctx.getBitWidth(ToQT), E); + // Float to integral. if (isIntegralType(ToT) || ToT == PT_Bool) return this->emitCastFloatingIntegral(ToT, E); } if (isIntegralType(FromT) || FromT == PT_Bool) { + if (ToT == PT_IntAP) + return this->emitCastAP(FromT, Ctx.getBitWidth(ToQT), E); + if (ToT == PT_IntAPS) + return this->emitCastAPS(FromT, Ctx.getBitWidth(ToQT), E); + // Integral to integral. if (isIntegralType(ToT) || ToT == PT_Bool) return FromT != ToT ? this->emitCast(FromT, ToT, E) : true; diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index 6039a54d32a5b2..fba4e45b9aa23c 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -123,6 +123,7 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, bool VisitPackIndexingExpr(const PackIndexingExpr *E); bool VisitRecoveryExpr(const RecoveryExpr *E); bool VisitAddrLabelExpr(const AddrLabelExpr *E); + bool VisitConvertVectorExpr(const ConvertVectorExpr *E); protected: bool visitExpr(const Expr *E) override; diff --git a/clang/lib/AST/Interp/IntegralAP.h b/clang/lib/AST/Interp/IntegralAP.h index fb7ee14515715a..7464f15cdb03b4 100644 --- a/clang/lib/AST/Interp/IntegralAP.h +++ b/clang/lib/AST/Interp/IntegralAP.h @@ -61,7 +61,7 @@ template class IntegralAP final { IntegralAP(APInt V) : V(V) {} /// Arbitrary value for uninitialized variables. 
- IntegralAP() : IntegralAP(-1, 1024) {} + IntegralAP() : IntegralAP(-1, 3) {} IntegralAP operator-() const { return IntegralAP(-V); } IntegralAP operator-(const IntegralAP &Other) const { diff --git a/clang/lib/AST/VTTBuilder.cpp b/clang/lib/AST/VTTBuilder.cpp index d58e8751778520..464a2014c430a0 100644 --- a/clang/lib/AST/VTTBuilder.cpp +++ b/clang/lib/AST/VTTBuilder.cpp @@ -189,7 +189,7 @@ void VTTBuilder::LayoutVTT(BaseSubobject Base, bool BaseIsVirtual) { if (!IsPrimaryVTT) { // Remember the sub-VTT index. - SubVTTIndicies[Base] = VTTComponents.size(); + SubVTTIndices[Base] = VTTComponents.size(); } uint64_t VTableIndex = VTTVTables.size(); diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp index 567e85a02dc612..b4c724422c14aa 100644 --- a/clang/lib/CodeGen/CGCoroutine.cpp +++ b/clang/lib/CodeGen/CGCoroutine.cpp @@ -278,7 +278,11 @@ static LValueOrRValue emitSuspendExpression(CodeGenFunction &CGF, CGCoroData &Co llvm::Function *AwaitSuspendIntrinsic = CGF.CGM.getIntrinsic(AwaitSuspendIID); - const auto AwaitSuspendCanThrow = StmtCanThrow(S.getSuspendExpr()); + // SuspendHandle might throw since it also resumes the returned handle. + const bool AwaitSuspendCanThrow = + SuspendReturnType == + CoroutineSuspendExpr::SuspendReturnType::SuspendHandle || + StmtCanThrow(S.getSuspendExpr()); llvm::CallBase *SuspendRet = nullptr; // FIXME: add call attributes? @@ -307,10 +311,7 @@ static LValueOrRValue emitSuspendExpression(CodeGenFunction &CGF, CGCoroData &Co break; } case CoroutineSuspendExpr::SuspendReturnType::SuspendHandle: { - assert(SuspendRet->getType()->isPointerTy()); - - auto ResumeIntrinsic = CGF.CGM.getIntrinsic(llvm::Intrinsic::coro_resume); - Builder.CreateCall(ResumeIntrinsic, SuspendRet); + assert(SuspendRet->getType()->isVoidTy()); break; } } diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 19c2ca99181fb7..926276c5dfab7c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -6851,7 +6851,8 @@ class MappableExprsHandler { const ValueDecl *Mapper = nullptr, bool ForDeviceAddr = false, const ValueDecl *BaseDecl = nullptr, const Expr *MapExpr = nullptr, ArrayRef - OverlappedElements = std::nullopt) const { + OverlappedElements = std::nullopt, + bool AreBothBasePtrAndPteeMapped = false) const { // The following summarizes what has to be generated for each map and the // types below. The generated information is expressed in this order: // base pointer, section pointer, size, flags @@ -7027,6 +7028,10 @@ class MappableExprsHandler { // &(ps->p), &(ps->p[0]), 33*sizeof(double), MEMBER_OF(4) | PTR_AND_OBJ | TO // (*) the struct this entry pertains to is the 4th element in the list // of arguments, hence MEMBER_OF(4) + // + // map(p, p[:100]) + // ===> map(p[:100]) + // &p, &p[0], 100*sizeof(float), TARGET_PARAM | PTR_AND_OBJ | TO | FROM // Track if the map information being generated is the first for a capture. bool IsCaptureFirstInfo = IsFirstComponentList; @@ -7050,6 +7055,8 @@ class MappableExprsHandler { const auto *OASE = dyn_cast(AssocExpr); const auto *OAShE = dyn_cast(AssocExpr); + if (AreBothBasePtrAndPteeMapped && std::next(I) == CE) + return; if (isa(AssocExpr)) { // The base is the 'this' pointer. The content of the pointer is going // to be the base of the field being mapped. 
@@ -7092,8 +7099,9 @@ class MappableExprsHandler { // can be associated with the combined storage if shared memory mode is // active or the base declaration is not global variable. const auto *VD = dyn_cast(I->getAssociatedDeclaration()); - if (CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory() || - !VD || VD->hasLocalStorage()) + if (!AreBothBasePtrAndPteeMapped && + (CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory() || + !VD || VD->hasLocalStorage())) BP = CGF.EmitLoadOfPointer(BP, Ty->castAs()); else FirstPointerInComplexData = true; @@ -7415,11 +7423,13 @@ class MappableExprsHandler { // same expression except for the first one. We also need to signal // this map is the first one that relates with the current capture // (there is a set of entries for each capture). - OpenMPOffloadMappingFlags Flags = getMapTypeBits( - MapType, MapModifiers, MotionModifiers, IsImplicit, - !IsExpressionFirstInfo || RequiresReference || - FirstPointerInComplexData || IsMemberReference, - IsCaptureFirstInfo && !RequiresReference, IsNonContiguous); + OpenMPOffloadMappingFlags Flags = + getMapTypeBits(MapType, MapModifiers, MotionModifiers, IsImplicit, + !IsExpressionFirstInfo || RequiresReference || + FirstPointerInComplexData || IsMemberReference, + AreBothBasePtrAndPteeMapped || + (IsCaptureFirstInfo && !RequiresReference), + IsNonContiguous); if (!IsExpressionFirstInfo || IsMemberReference) { // If we have a PTR_AND_OBJ pair where the OBJ is a pointer as well, @@ -8513,6 +8523,8 @@ class MappableExprsHandler { assert(CurDir.is() && "Expect a executable directive"); const auto *CurExecDir = CurDir.get(); + bool HasMapBasePtr = false; + bool HasMapArraySec = false; for (const auto *C : CurExecDir->getClausesOfKind()) { const auto *EI = C->getVarRefs().begin(); for (const auto L : C->decl_component_lists(VD)) { @@ -8524,6 +8536,11 @@ class MappableExprsHandler { assert(VDecl == VD && "We got information for the wrong declaration??"); assert(!Components.empty() && "Not expecting declaration with no component lists."); + if (VD && E && VD->getType()->isAnyPointerType() && isa(E)) + HasMapBasePtr = true; + if (VD && E && VD->getType()->isAnyPointerType() && + (isa(E) || isa(E))) + HasMapArraySec = true; DeclComponentLists.emplace_back(Components, C->getMapType(), C->getMapTypeModifiers(), C->isImplicit(), Mapper, E); @@ -8706,7 +8723,9 @@ class MappableExprsHandler { MapType, MapModifiers, std::nullopt, Components, CombinedInfo, StructBaseCombinedInfo, PartialStruct, IsFirstComponentList, IsImplicit, /*GenerateAllInfoForClauses*/ false, Mapper, - /*ForDeviceAddr=*/false, VD, VarRef); + /*ForDeviceAddr=*/false, VD, VarRef, + /*OverlappedElements*/ std::nullopt, + HasMapBasePtr && HasMapArraySec); IsFirstComponentList = false; } } diff --git a/clang/lib/CodeGen/CGVTT.cpp b/clang/lib/CodeGen/CGVTT.cpp index d2376b14dd5826..4cebb750c89e8f 100644 --- a/clang/lib/CodeGen/CGVTT.cpp +++ b/clang/lib/CodeGen/CGVTT.cpp @@ -138,23 +138,24 @@ uint64_t CodeGenVTables::getSubVTTIndex(const CXXRecordDecl *RD, BaseSubobject Base) { BaseSubobjectPairTy ClassSubobjectPair(RD, Base); - SubVTTIndiciesMapTy::iterator I = SubVTTIndicies.find(ClassSubobjectPair); - if (I != SubVTTIndicies.end()) + SubVTTIndicesMapTy::iterator I = SubVTTIndices.find(ClassSubobjectPair); + if (I != SubVTTIndices.end()) return I->second; VTTBuilder Builder(CGM.getContext(), RD, /*GenerateDefinition=*/false); - for (llvm::DenseMap::const_iterator I = - Builder.getSubVTTIndicies().begin(), - E = Builder.getSubVTTIndicies().end(); I != 
E; ++I) { + for (llvm::DenseMap::const_iterator + I = Builder.getSubVTTIndices().begin(), + E = Builder.getSubVTTIndices().end(); + I != E; ++I) { // Insert all indices. BaseSubobjectPairTy ClassSubobjectPair(RD, I->first); - SubVTTIndicies.insert(std::make_pair(ClassSubobjectPair, I->second)); + SubVTTIndices.insert(std::make_pair(ClassSubobjectPair, I->second)); } - I = SubVTTIndicies.find(ClassSubobjectPair); - assert(I != SubVTTIndicies.end() && "Did not find index!"); + I = SubVTTIndices.find(ClassSubobjectPair); + assert(I != SubVTTIndices.end() && "Did not find index!"); return I->second; } diff --git a/clang/lib/CodeGen/CGVTables.h b/clang/lib/CodeGen/CGVTables.h index 9d4223547050d4..c06bf7a525d9fd 100644 --- a/clang/lib/CodeGen/CGVTables.h +++ b/clang/lib/CodeGen/CGVTables.h @@ -38,10 +38,10 @@ class CodeGenVTables { typedef VTableLayout::AddressPointsMapTy VTableAddressPointsMapTy; typedef std::pair BaseSubobjectPairTy; - typedef llvm::DenseMap SubVTTIndiciesMapTy; + typedef llvm::DenseMap SubVTTIndicesMapTy; - /// SubVTTIndicies - Contains indices into the various sub-VTTs. - SubVTTIndiciesMapTy SubVTTIndicies; + /// SubVTTIndices - Contains indices into the various sub-VTTs. + SubVTTIndicesMapTy SubVTTIndices; typedef llvm::DenseMap SecondaryVirtualPointerIndicesMapTy; diff --git a/clang/lib/CodeGen/Targets/Sparc.cpp b/clang/lib/CodeGen/Targets/Sparc.cpp index 9025a633f328e2..561f0b514d909b 100644 --- a/clang/lib/CodeGen/Targets/Sparc.cpp +++ b/clang/lib/CodeGen/Targets/Sparc.cpp @@ -263,7 +263,11 @@ SparcV9ABIInfo::classifyType(QualType Ty, unsigned SizeLimit) const { CoerceBuilder CB(getVMContext(), getDataLayout()); CB.addStruct(0, StrTy); - CB.pad(llvm::alignTo(CB.DL.getTypeSizeInBits(StrTy), 64)); + // All structs, even empty ones, should take up a register argument slot, + // so pin the minimum struct size to one bit. + CB.pad(llvm::alignTo( + std::max(CB.DL.getTypeSizeInBits(StrTy).getKnownMinValue(), uint64_t(1)), + 64)); // Try to use the original type for coercion. llvm::Type *CoerceTy = CB.isUsableType(StrTy) ? StrTy : CB.getType(); diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp index e20d9fb1cfc417..3770471bae7c0d 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp @@ -375,7 +375,8 @@ std::string OpenBSD::getCompilerRT(const ArgList &Args, StringRef Component, if (Component == "builtins") { SmallString<128> Path(getDriver().SysRoot); llvm::sys::path::append(Path, "/usr/lib/libcompiler_rt.a"); - return std::string(Path); + if (getVFS().exists(Path)) + return std::string(Path); } SmallString<128> P(getDriver().ResourceDir); std::string CRTBasename = diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index adcbe5858bc78e..869b9c6669c27f 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -1439,6 +1439,8 @@ Decl *Parser::ParseFunctionDefinition(ParsingDeclarator &D, } } + Sema::FPFeaturesStateRAII SaveFPFeatures(Actions); + // Tell the actions module that we have entered a function definition with the // specified Declarator for the function. 
SkipBodyInfo SkipBody; @@ -1497,8 +1499,6 @@ Decl *Parser::ParseFunctionDefinition(ParsingDeclarator &D, return Actions.ActOnFinishFunctionBody(Res, nullptr, false); } - Sema::FPFeaturesStateRAII SaveFPFeatures(Actions); - if (Tok.is(tok::kw_try)) return ParseFunctionTryBlock(Res, BodyScope); diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp index e138debd1361ca..92347f8fafc000 100644 --- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp @@ -103,9 +103,8 @@ class RAIIMutexDescriptor { // this function is called instead of early returning it. To avoid this, a // bool variable (IdentifierInfoInitialized) is used and the function will // be run only once. - Guard = &Call.getCalleeAnalysisDeclContext()->getASTContext().Idents.get( - GuardName); - IdentifierInfoInitialized = true; + const auto &ASTCtx = Call.getState()->getStateManager().getContext(); + Guard = &ASTCtx.Idents.get(GuardName); } } diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp index 0cbab1fcd91d09..afdfd25527e4a1 100644 --- a/clang/test/AST/Interp/builtin-functions.cpp +++ b/clang/test/AST/Interp/builtin-functions.cpp @@ -639,3 +639,238 @@ void test7(void) { /// the actual implementation uses analyze_os_log::computeOSLogBufferLayout(), which /// is tested elsewhere. static_assert(__builtin_os_log_format_buffer_size("%{mask.xyz}s", "abc") != 0, ""); + +/// Copied from test/Sema/constant_builtins_vector.cpp. +/// Some tests are missing since we run this for multiple targets, +/// some of which do not support _BitInt. +#ifndef __AVR__ +namespace convertvector { + typedef _BitInt(128) BitInt128; + + typedef double vector4double __attribute__((__vector_size__(32))); + typedef float vector4float __attribute__((__vector_size__(16))); + typedef long long vector4long __attribute__((__vector_size__(32))); + typedef int vector4int __attribute__((__vector_size__(16))); + typedef short vector4short __attribute__((__vector_size__(8))); + typedef char vector4char __attribute__((__vector_size__(4))); + typedef BitInt128 vector4BitInt128 __attribute__((__vector_size__(64))); + typedef double vector8double __attribute__((__vector_size__(64))); + typedef float vector8float __attribute__((__vector_size__(32))); + typedef long long vector8long __attribute__((__vector_size__(64))); + typedef int vector8int __attribute__((__vector_size__(32))); + typedef short vector8short __attribute__((__vector_size__(16))); + typedef char vector8char __attribute__((__vector_size__(8))); + typedef BitInt128 vector8BitInt128 __attribute__((__vector_size__(128))); + + constexpr vector4double from_vector4double_to_vector4double_var = + __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4double); + constexpr vector4float from_vector4double_to_vector4float_var = + __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4float); + constexpr vector4long from_vector4double_to_vector4long_var = + __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4long); + constexpr vector4int from_vector4double_to_vector4int_var = + __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4int); + constexpr vector4short from_vector4double_to_vector4short_var = + __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4short); + constexpr vector4char from_vector4double_to_vector4char_var = + 
__builtin_convertvector((vector4double){0, 1, 2, 3}, vector4char); + constexpr vector4BitInt128 from_vector4double_to_vector4BitInt128_var = + __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4BitInt128); + constexpr vector4double from_vector4float_to_vector4double_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4double); + constexpr vector4float from_vector4float_to_vector4float_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4float); + constexpr vector4long from_vector4float_to_vector4long_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4long); + constexpr vector4int from_vector4float_to_vector4int_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4int); + constexpr vector4short from_vector4float_to_vector4short_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4short); + constexpr vector4char from_vector4float_to_vector4char_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4char); + constexpr vector4BitInt128 from_vector4float_to_vector4BitInt128_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4BitInt128); + constexpr vector4double from_vector4long_to_vector4double_var = + __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4double); + constexpr vector4float from_vector4long_to_vector4float_var = + __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4float); + constexpr vector4long from_vector4long_to_vector4long_var = + __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4long); + constexpr vector4int from_vector4long_to_vector4int_var = + __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4int); + constexpr vector4short from_vector4long_to_vector4short_var = + __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4short); + constexpr vector4char from_vector4long_to_vector4char_var = + __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4char); + constexpr vector4BitInt128 from_vector4long_to_vector4BitInt128_var = + __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4BitInt128); + constexpr vector4double from_vector4int_to_vector4double_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4double); + constexpr vector4float from_vector4int_to_vector4float_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4float); + constexpr vector4long from_vector4int_to_vector4long_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4long); + constexpr vector4int from_vector4int_to_vector4int_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4int); + constexpr vector4short from_vector4int_to_vector4short_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4short); + constexpr vector4char from_vector4int_to_vector4char_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4char); + constexpr vector4BitInt128 from_vector4int_to_vector4BitInt128_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4BitInt128); + constexpr vector4double from_vector4short_to_vector4double_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4double); + constexpr vector4float from_vector4short_to_vector4float_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4float); + constexpr vector4long from_vector4short_to_vector4long_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4long); + constexpr vector4int from_vector4short_to_vector4int_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4int); + constexpr vector4short 
from_vector4short_to_vector4short_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4short); + constexpr vector4char from_vector4short_to_vector4char_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4char); + constexpr vector4BitInt128 from_vector4short_to_vector4BitInt128_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4BitInt128); + constexpr vector4double from_vector4char_to_vector4double_var = + __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4double); + constexpr vector4float from_vector4char_to_vector4float_var = + __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4float); + constexpr vector4long from_vector4char_to_vector4long_var = + __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4long); + constexpr vector4int from_vector4char_to_vector4int_var = + __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4int); + constexpr vector4short from_vector4char_to_vector4short_var = + __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4short); + constexpr vector4char from_vector4char_to_vector4char_var = + __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4char); + constexpr vector8double from_vector8double_to_vector8double_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8double_to_vector8float_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, + vector8float); + constexpr vector8long from_vector8double_to_vector8long_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, + vector8long); + constexpr vector8int from_vector8double_to_vector8int_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, + vector8int); + constexpr vector8short from_vector8double_to_vector8short_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, + vector8short); + constexpr vector8char from_vector8double_to_vector8char_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, + vector8char); + constexpr vector8BitInt128 from_vector8double_to_vector8BitInt128_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, + vector8BitInt128); + constexpr vector8double from_vector8float_to_vector8double_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8float_to_vector8float_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, + vector8float); + constexpr vector8long from_vector8float_to_vector8long_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, + vector8long); + constexpr vector8int from_vector8float_to_vector8int_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, vector8int); + constexpr vector8short from_vector8float_to_vector8short_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, + vector8short); + constexpr vector8char from_vector8float_to_vector8char_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, + vector8char); + constexpr vector8BitInt128 from_vector8float_to_vector8BitInt128_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, + vector8BitInt128); + constexpr vector8double from_vector8long_to_vector8double_var = + __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8long_to_vector8float_var = + __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, + vector8float); 
+ constexpr vector8long from_vector8long_to_vector8long_var = + __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8long); + constexpr vector8int from_vector8long_to_vector8int_var = + __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8int); + constexpr vector8short from_vector8long_to_vector8short_var = + __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, + vector8short); + constexpr vector8char from_vector8long_to_vector8char_var = + __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8char); + constexpr vector8double from_vector8int_to_vector8double_var = + __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8int_to_vector8float_var = + __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8float); + constexpr vector8long from_vector8int_to_vector8long_var = + __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8long); + constexpr vector8int from_vector8int_to_vector8int_var = + __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8int); + constexpr vector8short from_vector8int_to_vector8short_var = + __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8short); + constexpr vector8char from_vector8int_to_vector8char_var = + __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8char); + constexpr vector8double from_vector8short_to_vector8double_var = + __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8short_to_vector8float_var = + __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, + vector8float); + constexpr vector8long from_vector8short_to_vector8long_var = + __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, + vector8long); + constexpr vector8int from_vector8short_to_vector8int_var = + __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, vector8int); + constexpr vector8short from_vector8short_to_vector8short_var = + __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, + vector8short); + constexpr vector8char from_vector8short_to_vector8char_var = + __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, + vector8char); + + constexpr vector8double from_vector8char_to_vector8double_var = + __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8char_to_vector8float_var = + __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, + vector8float); + constexpr vector8long from_vector8char_to_vector8long_var = + __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8long); + constexpr vector8int from_vector8char_to_vector8int_var = + __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8int); + constexpr vector8short from_vector8char_to_vector8short_var = + __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, + vector8short); + constexpr vector8char from_vector8char_to_vector8char_var = + __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8char); + constexpr vector8double from_vector8BitInt128_to_vector8double_var = + __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8BitInt128_to_vector8float_var = + __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8float); + constexpr vector8long from_vector8BitInt128_to_vector8long_var = + 
__builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8long); + constexpr vector8int from_vector8BitInt128_to_vector8int_var = + __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8int); + constexpr vector8short from_vector8BitInt128_to_vector8short_var = + __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8short); + constexpr vector8char from_vector8BitInt128_to_vector8char_var = + __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8char); + constexpr vector8BitInt128 from_vector8BitInt128_to_vector8BitInt128_var = + __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8BitInt128); + static_assert(from_vector8BitInt128_to_vector8BitInt128_var[0] == 0, ""); // ref-error {{not an integral constant expression}} + static_assert(from_vector8BitInt128_to_vector8BitInt128_var[1] == 1, ""); // ref-error {{not an integral constant expression}} + static_assert(from_vector8BitInt128_to_vector8BitInt128_var[2] == 2, ""); // ref-error {{not an integral constant expression}} + static_assert(from_vector8BitInt128_to_vector8BitInt128_var[3] == 3, ""); // ref-error {{not an integral constant expression}} + static_assert(from_vector8BitInt128_to_vector8BitInt128_var[4] == 4, ""); // ref-error {{not an integral constant expression}} +} +#endif diff --git a/clang/test/Analysis/block-in-critical-section.c b/clang/test/Analysis/block-in-critical-section.c new file mode 100644 index 00000000000000..1e174af541b183 --- /dev/null +++ b/clang/test/Analysis/block-in-critical-section.c @@ -0,0 +1,6 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.BlockInCriticalSection -verify %s +// expected-no-diagnostics + +// This should not crash +int (*a)(void); +void b(void) { a(); } diff --git a/clang/test/CodeGen/sparcv9-abi.c b/clang/test/CodeGen/sparcv9-abi.c index 5e74a9a883cefa..616e24e7c519d8 100644 --- a/clang/test/CodeGen/sparcv9-abi.c +++ b/clang/test/CodeGen/sparcv9-abi.c @@ -21,6 +21,47 @@ char f_int_4(char x) { return x; } // CHECK-LABEL: define{{.*}} fp128 @f_ld(fp128 noundef %x) long double f_ld(long double x) { return x; } +// Zero-sized structs reserves an argument register slot if passed directly. +struct empty {}; +struct emptyarr { struct empty a[10]; }; + +// CHECK-LABEL: define{{.*}} i64 @f_empty(i64 %x.coerce) +struct empty f_empty(struct empty x) { return x; } + +// CHECK-LABEL: define{{.*}} i64 @f_emptyarr(i64 %x.coerce) +struct empty f_emptyarr(struct emptyarr x) { return x.a[0]; } + +// CHECK-LABEL: define{{.*}} i64 @f_emptyvar(i32 noundef zeroext %count, ...) +long f_emptyvar(unsigned count, ...) { + long ret; + va_list args; + va_start(args, count); + +// CHECK: %[[CUR:[^ ]+]] = load ptr, ptr %args +// CHECK-DAG: %[[NXT:[^ ]+]] = getelementptr inbounds i8, ptr %[[CUR]], i64 8 +// CHECK-DAG: store ptr %[[NXT]], ptr %args + va_arg(args, struct empty); + +// CHECK: %[[CUR:[^ ]+]] = load ptr, ptr %args +// CHECK-DAG: %[[NXT:[^ ]+]] = getelementptr inbounds i8, ptr %[[CUR]], i64 8 +// CHECK-DAG: store ptr %[[NXT]], ptr %args +// CHECK-DAG: load i64, ptr %[[CUR]] + ret = va_arg(args, long); + va_end(args); + return ret; +} + +// If the zero-sized struct is contained in a non-zero-sized struct, +// though, it doesn't reserve any registers. 
+struct emptymixed { struct empty a; long b; }; +struct emptyflex { unsigned count; struct empty data[10]; }; + +// CHECK-LABEL: define{{.*}} i64 @f_emptymixed(i64 %x.coerce) +long f_emptymixed(struct emptymixed x) { return x.b; } + +// CHECK-LABEL: define{{.*}} i64 @f_emptyflex(i64 %x.coerce, i64 noundef %y) +long f_emptyflex(struct emptyflex x, long y) { return y; } + // Small structs are passed in registers. struct small { int *a, *b; diff --git a/clang/test/CodeGen/sparcv9-class-return.cpp b/clang/test/CodeGen/sparcv9-class-return.cpp new file mode 100644 index 00000000000000..2428219422d8a0 --- /dev/null +++ b/clang/test/CodeGen/sparcv9-class-return.cpp @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -triple sparcv9-unknown-unknown -emit-llvm %s -o - | FileCheck %s + +class Empty { +}; + +class Long : public Empty { +public: + long l; +}; + +// CHECK: define{{.*}} i64 @_Z4foo15Empty(i64 %e.coerce) +Empty foo1(Empty e) { + return e; +} + +// CHECK: define{{.*}} %class.Long @_Z4foo24Long(i64 %l.coerce) +Long foo2(Long l) { + return l; +} + +// CHECK: define{{.*}} i64 @_Z4foo34Long(i64 %l.coerce) +long foo3(Long l) { + return l.l; +} diff --git a/clang/test/CodeGenCoroutines/coro-await.cpp b/clang/test/CodeGenCoroutines/coro-await.cpp index 65bfb099468817..c7a09e8b8bc7c4 100644 --- a/clang/test/CodeGenCoroutines/coro-await.cpp +++ b/clang/test/CodeGenCoroutines/coro-await.cpp @@ -370,8 +370,8 @@ extern "C" void TestTailcall() { // --------------------------- // Call coro.await.suspend // --------------------------- - // CHECK-NEXT: %[[RESUMED:.+]] = call ptr @llvm.coro.await.suspend.handle(ptr %[[AWAITABLE]], ptr %[[FRAME]], ptr @TestTailcall.__await_suspend_wrapper__await) - // CHECK-NEXT: call void @llvm.coro.resume(ptr %[[RESUMED]]) + // Note: The call must not be nounwind since the resumed function could throw. 
+ // CHECK-NEXT: call void @llvm.coro.await.suspend.handle(ptr %[[AWAITABLE]], ptr %[[FRAME]], ptr @TestTailcall.__await_suspend_wrapper__await){{$}} // CHECK-NEXT: %[[OUTCOME:.+]] = call i8 @llvm.coro.suspend(token %[[SUSPEND_ID]], i1 false) // CHECK-NEXT: switch i8 %[[OUTCOME]], label %[[RET_BB:.+]] [ // CHECK-NEXT: i8 0, label %[[READY_BB]] diff --git a/clang/test/CodeGenCoroutines/coro-symmetric-transfer-01.cpp b/clang/test/CodeGenCoroutines/coro-symmetric-transfer-01.cpp deleted file mode 100644 index da30e12c63cffb..00000000000000 --- a/clang/test/CodeGenCoroutines/coro-symmetric-transfer-01.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -O0 -emit-llvm %s -o - -disable-llvm-passes | FileCheck %s -// RUN: %clang -std=c++20 -O0 -emit-llvm -c %s -o %t -Xclang -disable-llvm-passes && %clang -c %t - -#include "Inputs/coroutine.h" - -struct detached_task { - struct promise_type { - detached_task get_return_object() noexcept { - return detached_task{std::coroutine_handle::from_promise(*this)}; - } - - void return_void() noexcept {} - - struct final_awaiter { - bool await_ready() noexcept { return false; } - std::coroutine_handle<> await_suspend(std::coroutine_handle h) noexcept { - h.destroy(); - return {}; - } - void await_resume() noexcept {} - }; - - void unhandled_exception() noexcept {} - - final_awaiter final_suspend() noexcept { return {}; } - - std::suspend_always initial_suspend() noexcept { return {}; } - }; - - ~detached_task() { - if (coro_) { - coro_.destroy(); - coro_ = {}; - } - } - - void start() && { - auto tmp = coro_; - coro_ = {}; - tmp.resume(); - } - - std::coroutine_handle coro_; -}; - -detached_task foo() { - co_return; -} - -// check that the lifetime of the coroutine handle used to obtain the address is contained within single basic block, and hence does not live across suspension points. 
-// CHECK-LABEL: final.suspend: -// CHECK: %{{.+}} = call token @llvm.coro.save(ptr null) -// CHECK: %[[HDL_TRANSFER:.+]] = call ptr @llvm.coro.await.suspend.handle -// CHECK: call void @llvm.coro.resume(ptr %[[HDL_TRANSFER]]) diff --git a/clang/test/CodeGenCoroutines/coro-symmetric-transfer-02.cpp b/clang/test/CodeGenCoroutines/coro-symmetric-transfer-02.cpp index ca6cf74115a3b1..f36f89926505f3 100644 --- a/clang/test/CodeGenCoroutines/coro-symmetric-transfer-02.cpp +++ b/clang/test/CodeGenCoroutines/coro-symmetric-transfer-02.cpp @@ -89,8 +89,7 @@ Task bar() { // CHECK: br i1 %{{.+}}, label %[[CASE1_AWAIT_READY:.+]], label %[[CASE1_AWAIT_SUSPEND:.+]] // CHECK: [[CASE1_AWAIT_SUSPEND]]: // CHECK-NEXT: %{{.+}} = call token @llvm.coro.save(ptr null) -// CHECK-NEXT: %[[HANDLE1_PTR:.+]] = call ptr @llvm.coro.await.suspend.handle -// CHECK-NEXT: call void @llvm.coro.resume(ptr %[[HANDLE1_PTR]]) +// CHECK-NEXT: call void @llvm.coro.await.suspend.handle // CHECK-NEXT: %{{.+}} = call i8 @llvm.coro.suspend // CHECK-NEXT: switch i8 %{{.+}}, label %coro.ret [ // CHECK-NEXT: i8 0, label %[[CASE1_AWAIT_READY]] @@ -104,8 +103,7 @@ Task bar() { // CHECK: br i1 %{{.+}}, label %[[CASE2_AWAIT_READY:.+]], label %[[CASE2_AWAIT_SUSPEND:.+]] // CHECK: [[CASE2_AWAIT_SUSPEND]]: // CHECK-NEXT: %{{.+}} = call token @llvm.coro.save(ptr null) -// CHECK-NEXT: %[[HANDLE2_PTR:.+]] = call ptr @llvm.coro.await.suspend.handle -// CHECK-NEXT: call void @llvm.coro.resume(ptr %[[HANDLE2_PTR]]) +// CHECK-NEXT: call void @llvm.coro.await.suspend.handle // CHECK-NEXT: %{{.+}} = call i8 @llvm.coro.suspend // CHECK-NEXT: switch i8 %{{.+}}, label %coro.ret [ // CHECK-NEXT: i8 0, label %[[CASE2_AWAIT_READY]] diff --git a/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp b/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp new file mode 100644 index 00000000000000..e2c27f37f5b9db --- /dev/null +++ b/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp @@ -0,0 +1,150 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +extern void *malloc (int __size) throw () __attribute__ ((__malloc__)); + +void foo() { + int *ptr = (int *) malloc(3 * sizeof(int)); + + #pragma omp target map(ptr, ptr[0:2]) + { + ptr[1] = 6; + } + #pragma omp target map(ptr, ptr[2]) + { + ptr[2] = 8; + } +} +#endif +// CHECK-LABEL: define {{[^@]+}}@_Z3foov +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[PTR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK-NEXT: 
[[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call noalias noundef ptr @_Z6malloci(i32 noundef signext 12) #[[ATTR3:[0-9]+]] +// CHECK-NEXT: store ptr [[CALL]], ptr [[PTR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[PTR]], ptr [[TMP2]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[ARRAYIDX]], ptr [[TMP3]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK-NEXT: store ptr null, ptr [[TMP4]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-NEXT: store i32 3, ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-NEXT: store i32 1, ptr [[TMP8]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK-NEXT: store ptr @.offload_sizes, ptr [[TMP11]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK-NEXT: store ptr @.offload_maptypes, ptr [[TMP12]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK-NEXT: store ptr null, ptr [[TMP14]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK-NEXT: store i64 0, ptr [[TMP16]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP17]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK-NEXT: store [3 x i32] zeroinitializer, ptr 
[[TMP18]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK-NEXT: store i32 0, ptr [[TMP19]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l15.region_id, ptr [[KERNEL_ARGS]]) +// CHECK-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +// CHECK-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK: omp_offload.failed: +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l15(ptr [[TMP0]]) #[[ATTR3]] +// CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK: omp_offload.cont: +// CHECK-NEXT: [[TMP22:%.*]] = load ptr, ptr [[PTR]], align 8 +// CHECK-NEXT: [[TMP23:%.*]] = load ptr, ptr [[PTR]], align 8 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 2 +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[PTR]], ptr [[TMP24]], align 8 +// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[ARRAYIDX1]], ptr [[TMP25]], align 8 +// CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i64 0, i64 0 +// CHECK-NEXT: store ptr null, ptr [[TMP26]], align 8 +// CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK-NEXT: store i32 3, ptr [[TMP29]], align 4 +// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK-NEXT: store i32 1, ptr [[TMP30]], align 4 +// CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK-NEXT: store ptr [[TMP27]], ptr [[TMP31]], align 8 +// CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK-NEXT: store ptr [[TMP28]], ptr [[TMP32]], align 8 +// CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK-NEXT: store ptr @.offload_sizes.1, ptr [[TMP33]], align 8 +// CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP34]], align 8 +// CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK-NEXT: store ptr null, ptr [[TMP35]], align 8 +// CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK-NEXT: store ptr null, ptr [[TMP36]], align 8 +// CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK-NEXT: store i64 0, ptr [[TMP37]], align 8 +// CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK-NEXT: store i64 0, ptr [[TMP38]], align 8 +// CHECK-NEXT: [[TMP39:%.*]] = 
getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP39]], align 4 +// CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP40]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK-NEXT: store i32 0, ptr [[TMP41]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l19.region_id, ptr [[KERNEL_ARGS5]]) +// CHECK-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK-NEXT: br i1 [[TMP43]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK: omp_offload.failed6: +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l19(ptr [[TMP22]]) #[[ATTR3]] +// CHECK-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK: omp_offload.cont7: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l15 +// CHECK-SAME: (ptr noundef [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +// CHECK-NEXT: store i32 6, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l19 +// CHECK-SAME: (ptr noundef [[PTR:%.*]]) #[[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +// CHECK-NEXT: store i32 8, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: ret void +// diff --git a/clang/test/PCH/optnone.cpp b/clang/test/PCH/optnone.cpp new file mode 100644 index 00000000000000..8351bd9de70dd6 --- /dev/null +++ b/clang/test/PCH/optnone.cpp @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -emit-pch -x c++-header %s -o %t.pch +// RUN: %clang_cc1 -emit-llvm -DMAIN -include-pch %t.pch %s -o /dev/null + +#ifndef MAIN +__attribute__((optnone)) void foo() {} +#endif diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp index 86afe22fac24cc..4eb753a7297a92 100644 --- a/clang/tools/driver/cc1as_main.cpp +++ b/clang/tools/driver/cc1as_main.cpp @@ -576,9 +576,6 @@ static bool ExecuteAssemblerImpl(AssemblerInvocation &Opts, Str.get()->emitZeros(1); } - // Assembly to object compilation should leverage assembly info. 
- Str->setUseAssemblerInfoForParsing(true); - bool Failed = false; std::unique_ptr Parser( diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/command/commands/DexExpectStepOrder.py b/cross-project-tests/debuginfo-tests/dexter/dex/command/commands/DexExpectStepOrder.py index cb5579b523dcf8..d6954a440f1ae4 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/command/commands/DexExpectStepOrder.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/command/commands/DexExpectStepOrder.py @@ -12,7 +12,7 @@ class DexExpectStepOrder(CommandBase): """Expect the line every `DexExpectStepOrder` is found on to be stepped on - in `order`. Each instance must have a set of unique ascending indicies. + in `order`. Each instance must have a set of unique ascending indices. DexExpectStepOrder(*order) diff --git a/flang/docs/HighLevelFIR.md b/flang/docs/HighLevelFIR.md index de8dc5a1959bb9..2399efcdeacd3d 100644 --- a/flang/docs/HighLevelFIR.md +++ b/flang/docs/HighLevelFIR.md @@ -590,7 +590,7 @@ Syntax: Note that %indices are not operands, they are the elemental region block arguments, representing the array iteration space in a one based fashion. -The choice of using one based indicies is to match Fortran default for +The choice of using one based indices is to match Fortran default for array variables, so that there is no need to generate bound adjustments when working with one based array variables in an expression. diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index f21acdd64d7c3e..f05cf1f5120f8d 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1233,8 +1233,7 @@ genCriticalOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item, + const ConstructQueue &queue, ConstructQueue::iterator item, const std::optional &name) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); mlir::FlatSymbolRefAttr nameAttr; @@ -1245,8 +1244,8 @@ genCriticalOp(Fortran::lower::AbstractConverter &converter, auto global = mod.lookupSymbol(nameStr); if (!global) { mlir::omp::CriticalClauseOps clauseOps; - genCriticalDeclareClauses(converter, semaCtx, clauses, loc, clauseOps, - nameStr); + genCriticalDeclareClauses(converter, semaCtx, item->clauses, loc, + clauseOps, nameStr); mlir::OpBuilder modBuilder(mod.getBodyRegion()); global = modBuilder.create(loc, clauseOps); @@ -1266,8 +1265,7 @@ genDistributeOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "Distribute construct"); return nullptr; } @@ -1277,10 +1275,11 @@ genFlushOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const ObjectList &objects, const List &clauses, - const ConstructQueue &queue, ConstructQueue::iterator item) { + const ObjectList &objects, const ConstructQueue &queue, + ConstructQueue::iterator item) { llvm::SmallVector operandRange; - genFlushClauses(converter, semaCtx, objects, clauses, loc, 
operandRange); + genFlushClauses(converter, semaCtx, objects, item->clauses, loc, + operandRange); return converter.getFirOpBuilder().create( converter.getCurrentLocation(), operandRange); @@ -1291,8 +1290,7 @@ genMasterOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_master), @@ -1304,8 +1302,7 @@ genOrderedOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "OMPD_ordered"); return nullptr; } @@ -1315,10 +1312,9 @@ genOrderedRegionOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { mlir::omp::OrderedRegionClauseOps clauseOps; - genOrderedRegionClauses(converter, semaCtx, clauses, loc, clauseOps); + genOrderedRegionClauses(converter, semaCtx, item->clauses, loc, clauseOps); return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, @@ -1331,15 +1327,15 @@ genParallelOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item, bool outerCombined = false) { + const ConstructQueue &queue, ConstructQueue::iterator item, + bool outerCombined = false) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); Fortran::lower::StatementContext stmtCtx; mlir::omp::ParallelClauseOps clauseOps; llvm::SmallVector privateSyms; llvm::SmallVector reductionTypes; llvm::SmallVector reductionSyms; - genParallelClauses(converter, semaCtx, stmtCtx, clauses, loc, + genParallelClauses(converter, semaCtx, stmtCtx, item->clauses, loc, /*processReduction=*/!outerCombined, clauseOps, reductionTypes, reductionSyms); @@ -1352,7 +1348,7 @@ genParallelOp(Fortran::lower::AbstractConverter &converter, OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_parallel) .setOuterCombined(outerCombined) - .setClauses(&clauses) + .setClauses(&item->clauses) .setReductions(&reductionSyms, &reductionTypes) .setGenRegionEntryCb(reductionCallback); @@ -1361,7 +1357,7 @@ genParallelOp(Fortran::lower::AbstractConverter &converter, clauseOps); bool privatize = !outerCombined; - DataSharingProcessor dsp(converter, semaCtx, clauses, eval, + DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval, /*useDelayedPrivatization=*/true, &symTable); if (privatize) @@ -1414,14 +1410,13 @@ genSectionOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List 
&clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { // Currently only private/firstprivate clause is handled, and // all privatization is done within `omp.section` operations. return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_section) - .setClauses(&clauses), + .setClauses(&item->clauses), queue, item); } @@ -1430,22 +1425,21 @@ genSectionsOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { mlir::omp::SectionsClauseOps clauseOps; - genSectionsClauses(converter, semaCtx, clauses, loc, clauseOps); + genSectionsClauses(converter, semaCtx, item->clauses, loc, clauseOps); auto &builder = converter.getFirOpBuilder(); // Insert privatizations before SECTIONS symTable.pushScope(); - DataSharingProcessor dsp(converter, semaCtx, clauses, eval); + DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval); dsp.processStep1(); List nonDsaClauses; List lastprivates; - for (const Clause &clause : clauses) { + for (const Clause &clause : item->clauses) { if (clause.id == llvm::omp::Clause::OMPC_lastprivate) { lastprivates.push_back(&std::get(clause.u)); } else { @@ -1508,18 +1502,18 @@ genSimdOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - DataSharingProcessor dsp(converter, semaCtx, clauses, eval); + DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval); dsp.processStep1(); Fortran::lower::StatementContext stmtCtx; mlir::omp::LoopNestClauseOps loopClauseOps; mlir::omp::SimdClauseOps simdClauseOps; llvm::SmallVector iv; - genLoopNestClauses(converter, semaCtx, eval, clauses, loc, loopClauseOps, iv); - genSimdClauses(converter, semaCtx, clauses, loc, simdClauseOps); + genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, + loopClauseOps, iv); + genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps); // Create omp.simd wrapper. auto simdOp = firOpBuilder.create(loc, simdClauseOps); @@ -1532,7 +1526,8 @@ genSimdOp(Fortran::lower::AbstractConverter &converter, // Create nested omp.loop_nest and fill body with loop contents. 
auto loopOp = firOpBuilder.create(loc, loopClauseOps); - auto *nestedEval = getCollapsedLoopEval(eval, getCollapseValue(clauses)); + auto *nestedEval = + getCollapsedLoopEval(eval, getCollapseValue(item->clauses)); auto ivCallback = [&](mlir::Operation *op) { genLoopVars(op, converter, loc, iv); @@ -1542,7 +1537,7 @@ genSimdOp(Fortran::lower::AbstractConverter &converter, createBodyOfOp(*loopOp, OpWithBodyGenInfo(converter, symTable, semaCtx, loc, *nestedEval, llvm::omp::Directive::OMPD_simd) - .setClauses(&clauses) + .setClauses(&item->clauses) .setDataSharingProcessor(&dsp) .setGenRegionEntryCb(ivCallback), queue, item); @@ -1555,15 +1550,14 @@ genSingleOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { mlir::omp::SingleClauseOps clauseOps; - genSingleClauses(converter, semaCtx, clauses, loc, clauseOps); + genSingleClauses(converter, semaCtx, item->clauses, loc, clauseOps); return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_single) - .setClauses(&clauses), + .setClauses(&item->clauses), queue, item, clauseOps); } @@ -1572,8 +1566,8 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item, bool outerCombined = false) { + const ConstructQueue &queue, ConstructQueue::iterator item, + bool outerCombined = false) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); Fortran::lower::StatementContext stmtCtx; @@ -1586,7 +1580,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, deviceAddrSyms; llvm::SmallVector mapLocs, devicePtrLocs, deviceAddrLocs; llvm::SmallVector mapTypes, devicePtrTypes, deviceAddrTypes; - genTargetClauses(converter, semaCtx, stmtCtx, clauses, loc, + genTargetClauses(converter, semaCtx, stmtCtx, item->clauses, loc, processHostOnlyClauses, /*processReduction=*/outerCombined, clauseOps, mapSyms, mapLocs, mapTypes, deviceAddrSyms, deviceAddrLocs, deviceAddrTypes, devicePtrSyms, @@ -1690,15 +1684,14 @@ genTargetDataOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { Fortran::lower::StatementContext stmtCtx; mlir::omp::TargetDataClauseOps clauseOps; llvm::SmallVector useDeviceTypes; llvm::SmallVector useDeviceLocs; llvm::SmallVector useDeviceSyms; - genTargetDataClauses(converter, semaCtx, stmtCtx, clauses, loc, clauseOps, - useDeviceTypes, useDeviceLocs, useDeviceSyms); + genTargetDataClauses(converter, semaCtx, stmtCtx, item->clauses, loc, + clauseOps, useDeviceTypes, useDeviceLocs, useDeviceSyms); auto targetDataOp = converter.getFirOpBuilder().create(loc, @@ -1714,8 +1707,7 @@ static OpTy genTargetEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, - mlir::Location loc, const List &clauses, - const 
ConstructQueue &queue, + mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); Fortran::lower::StatementContext stmtCtx; @@ -1733,8 +1725,8 @@ genTargetEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter, } mlir::omp::TargetEnterExitUpdateDataClauseOps clauseOps; - genTargetEnterExitUpdateDataClauses(converter, semaCtx, stmtCtx, clauses, loc, - directive, clauseOps); + genTargetEnterExitUpdateDataClauses(converter, semaCtx, stmtCtx, + item->clauses, loc, directive, clauseOps); return firOpBuilder.create(loc, clauseOps); } @@ -1744,16 +1736,15 @@ genTaskOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { Fortran::lower::StatementContext stmtCtx; mlir::omp::TaskClauseOps clauseOps; - genTaskClauses(converter, semaCtx, stmtCtx, clauses, loc, clauseOps); + genTaskClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_task) - .setClauses(&clauses), + .setClauses(&item->clauses), queue, item, clauseOps); } @@ -1762,15 +1753,14 @@ genTaskgroupOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { mlir::omp::TaskgroupClauseOps clauseOps; - genTaskgroupClauses(converter, semaCtx, clauses, loc, clauseOps); + genTaskgroupClauses(converter, semaCtx, item->clauses, loc, clauseOps); return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_taskgroup) - .setClauses(&clauses), + .setClauses(&item->clauses), queue, item, clauseOps); } @@ -1779,8 +1769,7 @@ genTaskloopOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "Taskloop construct"); } @@ -1789,10 +1778,9 @@ genTaskwaitOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { mlir::omp::TaskwaitClauseOps clauseOps; - genTaskwaitClauses(converter, semaCtx, clauses, loc, clauseOps); + genTaskwaitClauses(converter, semaCtx, item->clauses, loc, clauseOps); return converter.getFirOpBuilder().create(loc, clauseOps); } @@ -1811,17 +1799,17 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item, bool outerCombined = false) { + 
const ConstructQueue &queue, ConstructQueue::iterator item, + bool outerCombined = false) { Fortran::lower::StatementContext stmtCtx; mlir::omp::TeamsClauseOps clauseOps; - genTeamsClauses(converter, semaCtx, stmtCtx, clauses, loc, clauseOps); + genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_teams) .setOuterCombined(outerCombined) - .setClauses(&clauses), + .setClauses(&item->clauses), queue, item, clauseOps); } @@ -1830,10 +1818,9 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - DataSharingProcessor dsp(converter, semaCtx, clauses, eval); + DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval); dsp.processStep1(); Fortran::lower::StatementContext stmtCtx; @@ -1842,8 +1829,9 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter, llvm::SmallVector iv; llvm::SmallVector reductionTypes; llvm::SmallVector reductionSyms; - genLoopNestClauses(converter, semaCtx, eval, clauses, loc, loopClauseOps, iv); - genWsloopClauses(converter, semaCtx, stmtCtx, clauses, loc, wsClauseOps, + genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, + loopClauseOps, iv); + genWsloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc, wsClauseOps, reductionTypes, reductionSyms); // Create omp.wsloop wrapper and populate entry block arguments with reduction @@ -1858,7 +1846,8 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter, // Create nested omp.loop_nest and fill body with loop contents. 
auto loopOp = firOpBuilder.create(loc, loopClauseOps); - auto *nestedEval = getCollapsedLoopEval(eval, getCollapseValue(clauses)); + auto *nestedEval = + getCollapsedLoopEval(eval, getCollapseValue(item->clauses)); auto ivCallback = [&](mlir::Operation *op) { genLoopVars(op, converter, loc, iv, reductionSyms, @@ -1869,7 +1858,7 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter, createBodyOfOp(*loopOp, OpWithBodyGenInfo(converter, symTable, semaCtx, loc, *nestedEval, llvm::omp::Directive::OMPD_do) - .setClauses(&clauses) + .setClauses(&item->clauses) .setDataSharingProcessor(&dsp) .setReductions(&reductionSyms, &reductionTypes) .setGenRegionEntryCb(ivCallback), @@ -1886,8 +1875,7 @@ static void genCompositeDistributeParallelDo( Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "Composite DISTRIBUTE PARALLEL DO"); } @@ -1896,8 +1884,7 @@ static void genCompositeDistributeParallelDoSimd( Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "Composite DISTRIBUTE PARALLEL DO SIMD"); } @@ -1906,8 +1893,7 @@ genCompositeDistributeSimd(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, - mlir::Location loc, const List &clauses, - const ConstructQueue &queue, + mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "Composite DISTRIBUTE SIMD"); } @@ -1916,10 +1902,9 @@ static void genCompositeDoSimd(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, - mlir::Location loc, const List &clauses, - const ConstructQueue &queue, + mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { - ClauseProcessor cp(converter, semaCtx, clauses); + ClauseProcessor cp(converter, semaCtx, item->clauses); cp.processTODO( loc, llvm::omp::OMPD_do_simd); @@ -1931,7 +1916,7 @@ static void genCompositeDoSimd(Fortran::lower::AbstractConverter &converter, // When support for vectorization is enabled, then we need to add handling of // if clause. Currently if clause can be skipped because we always assume // SIMD length = 1. 
- genWsloopOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genWsloopOp(converter, symTable, semaCtx, eval, loc, queue, item); } static void @@ -1939,8 +1924,7 @@ genCompositeTaskloopSimd(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, - mlir::Location loc, const List &clauses, - const ConstructQueue &queue, + mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "Composite TASKLOOP SIMD"); } @@ -1956,18 +1940,16 @@ static void genOMPDispatch(Fortran::lower::AbstractConverter &converter, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { assert(item != queue.end()); - const List &clauses = item->clauses; switch (llvm::omp::Directive dir = item->id) { case llvm::omp::Directive::OMPD_barrier: genBarrierOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_distribute: - genDistributeOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genDistributeOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_do: - genWsloopOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genWsloopOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_loop: case llvm::omp::Directive::OMPD_masked: @@ -1975,71 +1957,64 @@ static void genOMPDispatch(Fortran::lower::AbstractConverter &converter, llvm::omp::getOpenMPDirectiveName(dir) + ")"); break; case llvm::omp::Directive::OMPD_master: - genMasterOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genMasterOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_ordered: // Block-associated "ordered" construct. 
- genOrderedRegionOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genOrderedRegionOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_parallel: - genParallelOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item, + genParallelOp(converter, symTable, semaCtx, eval, loc, queue, item, /*outerCombined=*/false); break; case llvm::omp::Directive::OMPD_section: - genSectionOp(converter, symTable, semaCtx, eval, loc, /*clauses=*/{}, queue, - item); + genSectionOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_sections: - genSectionsOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genSectionsOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_simd: - genSimdOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genSimdOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_single: - genSingleOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genSingleOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_target: - genTargetOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item, + genTargetOp(converter, symTable, semaCtx, eval, loc, queue, item, /*outerCombined=*/false); break; case llvm::omp::Directive::OMPD_target_data: - genTargetDataOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genTargetDataOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_target_enter_data: genTargetEnterExitUpdateDataOp( - converter, symTable, semaCtx, loc, clauses, queue, item); + converter, symTable, semaCtx, loc, queue, item); break; case llvm::omp::Directive::OMPD_target_exit_data: genTargetEnterExitUpdateDataOp( - converter, symTable, semaCtx, loc, clauses, queue, item); + converter, symTable, semaCtx, loc, queue, item); break; case llvm::omp::Directive::OMPD_target_update: genTargetEnterExitUpdateDataOp( - converter, symTable, semaCtx, loc, clauses, queue, item); + converter, symTable, semaCtx, loc, queue, item); break; case llvm::omp::Directive::OMPD_task: - genTaskOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genTaskOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_taskgroup: - genTaskgroupOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genTaskgroupOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_taskloop: - genTaskloopOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genTaskloopOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_taskwait: - genTaskwaitOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genTaskwaitOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_taskyield: genTaskyieldOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_teams: - genTeamsOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genTeamsOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_tile: case llvm::omp::Directive::OMPD_unroll: @@ -2050,29 +2025,28 @@ static void genOMPDispatch(Fortran::lower::AbstractConverter &converter, // FIXME: Workshare is not a commonly used OpenMP construct, 
an // implementation for this feature will come later. For the codes // that use this construct, add a single construct for now. - genSingleOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genSingleOp(converter, symTable, semaCtx, eval, loc, queue, item); break; // Composite constructs case llvm::omp::Directive::OMPD_distribute_parallel_do: genCompositeDistributeParallelDo(converter, symTable, semaCtx, eval, loc, - clauses, queue, item); + queue, item); break; case llvm::omp::Directive::OMPD_distribute_parallel_do_simd: genCompositeDistributeParallelDoSimd(converter, symTable, semaCtx, eval, - loc, clauses, queue, item); + loc, queue, item); break; case llvm::omp::Directive::OMPD_distribute_simd: - genCompositeDistributeSimd(converter, symTable, semaCtx, eval, loc, clauses, - queue, item); + genCompositeDistributeSimd(converter, symTable, semaCtx, eval, loc, queue, + item); break; case llvm::omp::Directive::OMPD_do_simd: - genCompositeDoSimd(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genCompositeDoSimd(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_taskloop_simd: - genCompositeTaskloopSimd(converter, symTable, semaCtx, eval, loc, clauses, - queue, item); + genCompositeTaskloopSimd(converter, symTable, semaCtx, eval, loc, queue, + item); break; default: break; @@ -2194,8 +2168,8 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, eval, directive.source, directive.v, clauses)}; if (directive.v == llvm::omp::Directive::OMPD_ordered) { // Standalone "ordered" directive. - genOrderedOp(converter, symTable, semaCtx, eval, currentLocation, clauses, - queue, queue.begin()); + genOrderedOp(converter, symTable, semaCtx, eval, currentLocation, queue, + queue.begin()); } else { // Dispatch handles the "block-associated" variant of "ordered". genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue, @@ -2227,7 +2201,7 @@ genOMP(Fortran::lower::AbstractConverter &converter, converter.getFirOpBuilder().getModule(), semaCtx, eval, verbatim.source, llvm::omp::Directive::OMPD_flush, clauses)}; genFlushOp(converter, symTable, semaCtx, eval, currentLocation, objects, - clauses, queue, queue.begin()); + queue, queue.begin()); } static void @@ -2399,8 +2373,8 @@ genOMP(Fortran::lower::AbstractConverter &converter, const auto &name = std::get>(cd.t); mlir::Location currentLocation = converter.getCurrentLocation(); - genCriticalOp(converter, symTable, semaCtx, eval, currentLocation, clauses, - queue, queue.begin(), name); + genCriticalOp(converter, symTable, semaCtx, eval, currentLocation, queue, + queue.begin(), name); } static void diff --git a/flang/test/Lower/HLFIR/forall.f90 b/flang/test/Lower/HLFIR/forall.f90 index 9941ed19401038..c12f0c6a826b50 100644 --- a/flang/test/Lower/HLFIR/forall.f90 +++ b/flang/test/Lower/HLFIR/forall.f90 @@ -144,7 +144,7 @@ subroutine test_nested_foralls() ! ifoo and ibar could depend on x since it is a module ! variable use associated. The calls in the control value ! computation cannot be hoisted from the outer forall - ! even when they do not depend on outer forall indicies. + ! even when they do not depend on outer forall indices. 
     forall (integer(8)::j=jfoo():jbar())
       x(i, j) = x(j, i)
     end forall
diff --git a/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90 b/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90
new file mode 100644
index 00000000000000..53871276761fa4
--- /dev/null
+++ b/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90
@@ -0,0 +1,15 @@
+!Remove the --crash below once we can diagnose the issue more gracefully.
+!REQUIRES: asserts
+!RUN: not --crash %flang_fc1 -fopenmp -emit-hlfir -o - %s
+
+! Check that we reject the "task" reduction modifier on the "simd" directive.
+
+subroutine fred(x)
+  integer, intent(inout) :: x
+
+  !$omp simd reduction(task, +:x)
+  do i = 1, 100
+    x = foo(i)
+  enddo
+  !$omp end simd
+end
diff --git a/flang/test/Lower/OpenMP/parallel-sections.f90 b/flang/test/Lower/OpenMP/parallel-sections.f90
index 2f78dd4562b0ae..285102e06cad1d 100644
--- a/flang/test/Lower/OpenMP/parallel-sections.f90
+++ b/flang/test/Lower/OpenMP/parallel-sections.f90
@@ -39,13 +39,10 @@ end subroutine omp_parallel_sections
 subroutine omp_parallel_sections_allocate(x, y)
   use omp_lib
   integer, intent(inout) :: x, y
+  !CHECK: omp.parallel
   !CHECK: %[[allocator_1:.*]] = arith.constant 4 : i64
-  !CHECK: %[[allocator_2:.*]] = arith.constant 4 : i64
-  !CHECK: omp.parallel allocate(
-  !CHECK: %[[allocator_2]] : i64 -> %{{.*}} : !fir.ref) {
-  !CHECK: omp.sections allocate(
-  !CHECK: %[[allocator_1]] : i64 -> %{{.*}} : !fir.ref) {
-  !$omp parallel sections allocate(omp_high_bw_mem_alloc: x)
+  !CHECK: omp.sections allocate(%[[allocator_1]] : i64 -> %{{.*}} : !fir.ref) {
+  !$omp parallel sections allocate(omp_high_bw_mem_alloc: x) private(x, y)
   !CHECK: omp.section {
   !$omp section
     x = x + 12
diff --git a/flang/test/Lower/OpenMP/taskgroup.f90 b/flang/test/Lower/OpenMP/taskgroup.f90
index 76458f1f1127f3..d9d262bdd2c083 100644
--- a/flang/test/Lower/OpenMP/taskgroup.f90
+++ b/flang/test/Lower/OpenMP/taskgroup.f90
@@ -1,17 +1,14 @@
-! REQUIRES: openmp_runtime
-
 !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
 
+! The "allocate" clause has been removed, because it needs to be used
+! together with a privatizing clause. The only such clause for "taskgroup"
+! is "task_reduction", but it's not yet supported.
+
 !CHECK-LABEL: @_QPomp_taskgroup
 subroutine omp_taskgroup
-use omp_lib
-integer :: allocated_x
-!CHECK: %[[ALLOC_X_REF:.*]] = fir.alloca i32 {bindc_name = "allocated_x", uniq_name = "_QFomp_taskgroupEallocated_x"}
-!CHECK-NEXT: %[[ALLOC_X_DECL:.*]]:2 = hlfir.declare %[[ALLOC_X_REF]] {uniq_name = "_QFomp_taskgroupEallocated_x"} : (!fir.ref) -> (!fir.ref, !fir.ref)
-!CHECK: %[[C4:.*]] = arith.constant 4 : i64
-
-!CHECK: omp.taskgroup allocate(%[[C4]] : i64 -> %[[ALLOC_X_DECL]]#1 : !fir.ref)
-!$omp taskgroup allocate(omp_high_bw_mem_alloc: allocated_x)
+!CHECK: omp.taskgroup
+!$omp taskgroup
+!CHECK: omp.task
 !$omp task
 !CHECK: fir.call @_QPwork() {{.*}}: () -> ()
   call work()
diff --git a/flang/test/Lower/OpenMP/teams.f90 b/flang/test/Lower/OpenMP/teams.f90
index f122a578a6e160..b1b2e7080676e3 100644
--- a/flang/test/Lower/OpenMP/teams.f90
+++ b/flang/test/Lower/OpenMP/teams.f90
@@ -1,6 +1,6 @@
 ! REQUIRES: openmp_runtime
 
-! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 %s -o - | FileCheck %s
 
 ! CHECK-LABEL: func @_QPteams_simple
 subroutine teams_simple()
diff --git a/libc/src/__support/StringUtil/tables/stdc_errors.h b/libc/src/__support/StringUtil/tables/stdc_errors.h
index a9c1527834550f..6873d6bd51074b 100644
--- a/libc/src/__support/StringUtil/tables/stdc_errors.h
+++ b/libc/src/__support/StringUtil/tables/stdc_errors.h
@@ -15,11 +15,10 @@
 
 namespace LIBC_NAMESPACE {
 
-LIBC_INLINE_VAR constexpr const MsgTable<4> STDC_ERRORS = {
+LIBC_INLINE_VAR constexpr const MsgTable<3> STDC_ERRORS = {
     MsgMapping(0, "Success"),
     MsgMapping(EDOM, "Numerical argument out of domain"),
     MsgMapping(ERANGE, "Numerical result out of range"),
-    MsgMapping(EILSEQ, "Invalid or incomplete multibyte or wide character"),
 };
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h
index eda978a83ea8af..b9a8f303dd6738 100644
--- a/libc/src/stdio/printf_core/parser.h
+++ b/libc/src/stdio/printf_core/parser.h
@@ -496,7 +496,7 @@ template class Parser {
   // the type of index, and returns a TypeDesc describing that type. It does not
   // modify cur_pos.
   LIBC_INLINE TypeDesc get_type_desc(size_t index) {
-    // index mode is assumed, and the indicies start at 1, so an index
+    // index mode is assumed, and the indices start at 1, so an index
     // of 0 is invalid.
     size_t local_pos = 0;
 
diff --git a/libc/test/src/string/strerror_test.cpp b/libc/test/src/string/strerror_test.cpp
index ec9827b75cfc82..cfc79481699bc8 100644
--- a/libc/test/src/string/strerror_test.cpp
+++ b/libc/test/src/string/strerror_test.cpp
@@ -97,7 +97,7 @@ TEST(LlvmLibcStrErrorTest, KnownErrors) {
       ".lib section in a.out corrupted",
       "Attempting to link in too many shared libraries",
       "Cannot exec a shared library directly",
-      "Invalid or incomplete multibyte or wide character",
+      "Unknown error 84", // Unknown
       "Interrupted system call should be restarted",
       "Streams pipe error",
       "Too many users",
diff --git a/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h b/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h
index 3ac142cce3a348..588a5e9774a553 100644
--- a/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h
+++ b/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h
@@ -29,7 +29,7 @@
 #include
 
 // Layout that wraps indices to test some idiosyncratic behavior
-// - basically it is a layout_left where indicies are first wrapped i.e. i%Wrap
+// - basically it is a layout_left where indices are first wrapped i.e.
i%Wrap // - only accepts integers as indices // - is_always_strided and is_always_unique are false // - is_strided and is_unique are true if all extents are smaller than Wrap diff --git a/lld/test/MachO/objc.s b/lld/test/MachO/objc.s index e7074141f0113f..dbb9f1df275719 100644 --- a/lld/test/MachO/objc.s +++ b/lld/test/MachO/objc.s @@ -5,12 +5,14 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/has-objc-category.s -o %t/has-objc-category.o # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/has-objc-symbol-and-category.s -o %t/has-objc-symbol-and-category.o # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/has-swift.s -o %t/has-swift.o +# RUN: llvm-as %t/has-swift-ir-loaded.ll -o %t/has-swift-ir-loaded.o +# RUN: llvm-as %t/has-swift-ir-not-loaded.ll -o %t/has-swift-ir-not-loaded.o # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/has-swift-proto.s -o %t/has-swift-proto.o # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/no-objc.s -o %t/no-objc.o ## Make sure we don't mis-parse a 32-bit file as 64-bit # RUN: llvm-mc -filetype=obj -triple=armv7-apple-watchos %t/no-objc.s -o %t/wrong-arch.o -# RUN: llvm-ar rcs %t/libHasSomeObjC.a %t/no-objc.o %t/has-objc-symbol.o %t/has-objc-category.o %t/has-swift.o %t/has-swift-proto.o %t/wrong-arch.o -# RUN: llvm-ar rcs %t/libHasSomeObjC2.a %t/no-objc.o %t/has-objc-symbol-and-category.o %t/has-swift.o %t/has-swift-proto.o %t/wrong-arch.o +# RUN: llvm-ar rcs %t/libHasSomeObjC.a %t/no-objc.o %t/has-objc-symbol.o %t/has-objc-category.o %t/has-swift.o %t/has-swift-proto.o %t/has-swift-ir-loaded.o %t/has-swift-ir-not-loaded.o %t/wrong-arch.o +# RUN: llvm-ar rcs %t/libHasSomeObjC2.a %t/no-objc.o %t/has-objc-symbol-and-category.o %t/has-swift.o %t/has-swift-proto.o %t/has-swift-ir-loaded.o %t/has-swift-ir-not-loaded.o %t/wrong-arch.o # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o @@ -20,7 +22,7 @@ # RUN: %lld -lSystem %t/test.o -o %t/test -L%t -lHasSomeObjC2 -ObjC # RUN: llvm-objdump --section-headers --syms %t/test | FileCheck %s --check-prefix=OBJC -# RUN: %no-fatal-warnings-lld -lSystem %t/test.o -o %t/test --start-lib %t/no-objc.o %t/has-objc-symbol.o %t/has-objc-category.o %t/has-swift.o %t/has-swift-proto.o %t/wrong-arch.o --end-lib -ObjC 2>&1 \ +# RUN: %no-fatal-warnings-lld -lSystem %t/test.o -o %t/test --start-lib %t/no-objc.o %t/has-objc-symbol.o %t/has-objc-category.o %t/has-swift.o %t/has-swift-proto.o %t/has-swift-ir-loaded.o %t/has-swift-ir-not-loaded.o %t/wrong-arch.o --end-lib -ObjC 2>&1 \ # RUN: | FileCheck -check-prefix=WARNING %s # RUN: llvm-objdump --section-headers --syms %t/test | FileCheck %s --check-prefix=OBJC @@ -36,6 +38,7 @@ # OBJC-NEXT: 4 has_objc_symbol {{.*}} DATA # OBJC-EMPTY: # OBJC-NEXT: SYMBOL TABLE: +# OBJC-DAG: g O __TEXT,__swift _foo # OBJC-DAG: g F __TEXT,__text _main # OBJC-DAG: g F __TEXT,__text _OBJC_CLASS_$_MyObject # OBJC-DAG: g O __TEXT,__swift5_fieldmd $s7somelib4Blah_pMF @@ -100,6 +103,20 @@ _has_dup: .section __TEXT,__swift .quad 0x1234 +#--- has-swift-ir-loaded.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "x86_64-apple-darwin" + +@foo = global i64 1234, section "__TEXT,__swift" +@llvm.used = appending global [1 x ptr] [ptr @foo] + +#--- has-swift-ir-not-loaded.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "x86_64-apple-darwin" + +@bar = global i64 1234 +@llvm.used = appending global [1 x ptr] [ptr @bar] + #--- has-swift-proto.s .section __TEXT,__swift5_fieldmd .globl 
$s7somelib4Blah_pMF diff --git a/lldb/test/API/functionalities/completion/TestCompletion.py b/lldb/test/API/functionalities/completion/TestCompletion.py index 0d6907e0c3d229..63842487fc338d 100644 --- a/lldb/test/API/functionalities/completion/TestCompletion.py +++ b/lldb/test/API/functionalities/completion/TestCompletion.py @@ -107,9 +107,16 @@ def test_process_unload(self): self, "// Break here", lldb.SBFileSpec("main.cpp") ) err = lldb.SBError() - self.process().LoadImage( - lldb.SBFileSpec(self.getBuildArtifact("libshared.so")), err + local_spec = lldb.SBFileSpec(self.getBuildArtifact("libshared.so")) + remote_spec = ( + lldb.SBFileSpec( + lldbutil.append_to_process_working_directory(self, "libshared.so"), + False, + ) + if lldb.remote_platform + else lldb.SBFileSpec() ) + self.process().LoadImage(local_spec, remote_spec, err) self.assertSuccess(err) self.complete_from_to("process unload ", "process unload 0") @@ -473,7 +480,7 @@ def test_custom_command_completion(self): self.complete_from_to("my_test_cmd main.cp", ["main.cpp"]) self.expect("my_test_cmd main.cpp", substrs=["main.cpp"]) - @skipIfWindows + @skipIf(hostoslist=["windows"]) def test_completion_target_create_from_root_dir(self): """Tests source file completion by completing .""" root_dir = os.path.abspath(os.sep) diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemotePlatformFile.py b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemotePlatformFile.py index 2be5ae3132038d..c902722a2f74bb 100644 --- a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemotePlatformFile.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemotePlatformFile.py @@ -147,7 +147,9 @@ def vFile(self, packet): log=server2.responder.packetLog, ) - @skipIfWindows + @expectedFailureAll( + hostoslist=["windows"], bugnumber="github.com/llvm/llvm-project/issues/92255" + ) def test_file_permissions(self): """Test 'platform get-permissions'""" @@ -167,7 +169,9 @@ def vFile(self, packet): ] ) - @skipIfWindows + @expectedFailureAll( + hostoslist=["windows"], bugnumber="github.com/llvm/llvm-project/issues/92255" + ) def test_file_permissions_fallback(self): """Test 'platform get-permissions' fallback to fstat""" diff --git a/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py b/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py index 756f4d1e81caa0..ff1ef21e02e319 100644 --- a/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py +++ b/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py @@ -147,12 +147,12 @@ def check_stack(self, process, pid, filename): self.check_backtrace(thread, filename, backtrace) @skipIfLLVMTargetMissing("AArch64") - def test_aarch64(self): + def test_aarch64_single_threaded(self): """Test single-threaded aarch64 core dump.""" self.do_test("1lwp_SIGSEGV.aarch64", pid=8339, region_count=32) @skipIfLLVMTargetMissing("X86") - def test_amd64(self): + def test_amd64_single_threaded(self): """Test single-threaded amd64 core dump.""" self.do_test("1lwp_SIGSEGV.amd64", pid=693, region_count=21) @@ -177,12 +177,12 @@ def check_stack(self, process, pid, filename): self.assertEqual(thread.GetStopReasonDataAtIndex(0), 0) @skipIfLLVMTargetMissing("AArch64") - def test_aarch64(self): + def test_aarch64_thread_signaled(self): """Test double-threaded aarch64 core dump where thread 2 is signalled.""" self.do_test("2lwp_t2_SIGSEGV.aarch64", pid=14142, region_count=31) @skipIfLLVMTargetMissing("X86") - def test_amd64(self): + def 
test_amd64_thread_signaled(self): """Test double-threaded amd64 core dump where thread 2 is signalled.""" self.do_test("2lwp_t2_SIGSEGV.amd64", pid=622, region_count=24) @@ -207,11 +207,11 @@ def check_stack(self, process, pid, filename): self.assertEqual(thread.GetStopReasonDataAtIndex(0), signal.SIGSEGV) @skipIfLLVMTargetMissing("AArch64") - def test_aarch64(self): + def test_aarch64_process_signaled(self): """Test double-threaded aarch64 core dump where process is signalled.""" self.do_test("2lwp_process_SIGSEGV.aarch64", pid=1403, region_count=30) @skipIfLLVMTargetMissing("X86") - def test_amd64(self): + def test_amd64_process_signaled(self): """Test double-threaded amd64 core dump where process is signalled.""" self.do_test("2lwp_process_SIGSEGV.amd64", pid=665, region_count=24) diff --git a/lldb/test/API/iohandler/resize/TestIOHandlerResizeNoEditline.py b/lldb/test/API/iohandler/resize/TestIOHandlerResizeNoEditline.py index 3c07554f6cafd7..bbc2dcbe4e30ad 100644 --- a/lldb/test/API/iohandler/resize/TestIOHandlerResizeNoEditline.py +++ b/lldb/test/API/iohandler/resize/TestIOHandlerResizeNoEditline.py @@ -18,3 +18,4 @@ def test_resize_no_editline(self): dbg.RunCommandInterpreter(True, True, opts, 0, False, False) # Try resizing the terminal which shouldn't crash. dbg.SetTerminalWidth(47) + dbg.GetInputFile().Close() diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteCompletion.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteCompletion.py index 04d6abe9d88c15..58373d2f85bb99 100644 --- a/lldb/test/API/tools/lldb-server/TestGdbRemoteCompletion.py +++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteCompletion.py @@ -26,6 +26,7 @@ def init_lldb_server(self): def generate_hex_path(self, target): return str(os.path.join(self.getBuildDir(), target)).encode().hex() + @skipIfRemote @add_test_categories(["llgs"]) def test_autocomplete_path(self): self.build() diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md index 274b1519208a16..16ce4672be71c3 100644 --- a/lldb/tools/lldb-dap/README.md +++ b/lldb/tools/lldb-dap/README.md @@ -46,6 +46,7 @@ Installing the plug-in is very straightforward and involves just a few steps. ```bash cd /path/to/lldb/tools/lldb-dap +npm install npm run package # This also compiles the extension. npm run vscode-install ``` @@ -69,6 +70,7 @@ no effect. ```bash # Bump version in package.json cd /path/to/lldb/tools/lldb-dap +npm install npm run package npm run vscode-install ``` diff --git a/llvm/docs/Coroutines.rst b/llvm/docs/Coroutines.rst index 83369d93c309a7..36092325e536fb 100644 --- a/llvm/docs/Coroutines.rst +++ b/llvm/docs/Coroutines.rst @@ -1922,7 +1922,7 @@ Example: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :: - declare ptr @llvm.coro.await.suspend.handle( + declare void @llvm.coro.await.suspend.handle( ptr , ptr , ptr ) @@ -1967,7 +1967,9 @@ The intrinsic must be used between corresponding `coro.save`_ and `await_suspend_function` call during `CoroSplit`_ pass. `await_suspend_function` must return a pointer to a valid -coroutine frame, which is immediately resumed +coroutine frame. The intrinsic will be lowered to a tail call resuming the +returned coroutine frame. It will be marked `musttail` on targets that support +that. Instructions following the intrinsic will become unreachable. 
Example: """""""" @@ -1977,11 +1979,10 @@ Example: ; before lowering await.suspend: %save = call token @llvm.coro.save(ptr %hdl) - %next = call ptr @llvm.coro.await.suspend.handle( - ptr %awaiter, - ptr %hdl, - ptr @await_suspend_function) - call void @llvm.coro.resume(%next) + call void @llvm.coro.await.suspend.handle( + ptr %awaiter, + ptr %hdl, + ptr @await_suspend_function) %suspend = call i8 @llvm.coro.suspend(token %save, i1 false) ... @@ -1992,8 +1993,8 @@ Example: %next = call ptr @await_suspend_function( ptr %awaiter, ptr %hdl) - call void @llvm.coro.resume(%next) - %suspend = call i8 @llvm.coro.suspend(token %save, i1 false) + musttail call void @llvm.coro.resume(%next) + ret void ... ; wrapper function example diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst index 52dc039df7779e..5c28c6fcd30fb6 100644 --- a/llvm/docs/GlobalISel/GenericOpcode.rst +++ b/llvm/docs/GlobalISel/GenericOpcode.rst @@ -644,7 +644,7 @@ source vector should be inserted into. The index must be a constant multiple of the second source vector's minimum vector length. If the vectors are scalable, then the index is first scaled by the runtime scaling factor. The indices inserted in the source vector must be -valid indicies of that vector. If this condition cannot be determined statically +valid indices of that vector. If this condition cannot be determined statically but is false at runtime, then the result vector is undefined. .. code-block:: none @@ -661,7 +661,7 @@ the source vector. The index must be a constant multiple of the source vector's minimum vector length. If the source vector is a scalable vector, then the index is first scaled by the runtime scaling factor. The indices extracted from the source -vector must be valid indicies of that vector. If this condition cannot be +vector must be valid indices of that vector. If this condition cannot be determined statically but is false at runtime, then the result vector is undefined. diff --git a/llvm/examples/ExceptionDemo/CMakeLists.txt b/llvm/examples/ExceptionDemo/CMakeLists.txt index 0a60ad848dd406..793cf291ca6f11 100644 --- a/llvm/examples/ExceptionDemo/CMakeLists.txt +++ b/llvm/examples/ExceptionDemo/CMakeLists.txt @@ -1,7 +1,9 @@ set(LLVM_LINK_COMPONENTS Core ExecutionEngine - ORCJIT + MC + MCJIT + RuntimeDyld Support Target nativecodegen diff --git a/llvm/examples/ExceptionDemo/ExceptionDemo.cpp b/llvm/examples/ExceptionDemo/ExceptionDemo.cpp index 41fa0cf626bf35..0afc6b30d140e9 100644 --- a/llvm/examples/ExceptionDemo/ExceptionDemo.cpp +++ b/llvm/examples/ExceptionDemo/ExceptionDemo.cpp @@ -49,9 +49,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/BinaryFormat/Dwarf.h" -#include "llvm/ExecutionEngine/Orc/Core.h" -#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" -#include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/ExecutionEngine/MCJIT.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" @@ -85,8 +84,6 @@ #define USE_GLOBAL_STR_CONSTS true #endif -llvm::ExitOnError ExitOnErr; - // // Example types // @@ -145,7 +142,6 @@ static llvm::ConstantInt *ourExceptionCaughtState; typedef std::vector ArgNames; typedef std::vector ArgTypes; -typedef llvm::ArrayRef TypeArray; // // Code Generation Utilities @@ -896,10 +892,13 @@ void generateStringPrint(llvm::LLVMContext &context, /// generated, and is used to hold the constant string. 
A value of /// false indicates that the constant string will be stored on the /// stack. -void generateIntegerPrint(llvm::LLVMContext &context, llvm::Module &module, +void generateIntegerPrint(llvm::LLVMContext &context, + llvm::Module &module, llvm::IRBuilder<> &builder, - llvm::Function &printFunct, llvm::Value *toPrint, - std::string format, bool useGlobal = true) { + llvm::Function &printFunct, + llvm::Value &toPrint, + std::string format, + bool useGlobal = true) { llvm::Constant *stringConstant = llvm::ConstantDataArray::getString(context, format); llvm::Value *stringVar; @@ -921,9 +920,10 @@ void generateIntegerPrint(llvm::LLVMContext &context, llvm::Module &module, llvm::Value *cast = builder.CreateBitCast(stringVar, builder.getPtrTy()); - builder.CreateCall(&printFunct, {toPrint, cast}); + builder.CreateCall(&printFunct, {&toPrint, cast}); } + /// Generates code to handle finally block type semantics: always runs /// regardless of whether a thrown exception is passing through or the /// parent function is simply exiting. In addition to printing some state @@ -997,10 +997,10 @@ static llvm::BasicBlock *createFinallyBlock(llvm::LLVMContext &context, bufferToPrint.str(), USE_GLOBAL_STR_CONSTS); - llvm::SwitchInst *theSwitch = builder.CreateSwitch( - builder.CreateLoad(ourExceptionNotThrownState->getType(), - *exceptionCaughtFlag), - &terminatorBlock, 2); + llvm::SwitchInst *theSwitch = builder.CreateSwitch(builder.CreateLoad( + *exceptionCaughtFlag), + &terminatorBlock, + 2); theSwitch->addCase(ourExceptionCaughtState, &terminatorBlock); theSwitch->addCase(ourExceptionThrownState, &unwindResumeBlock); @@ -1186,7 +1186,7 @@ static llvm::Function *createCatchWrappedInvokeFunction( // Note: function handles NULL exceptions builder.CreateCall(deleteOurException, - builder.CreateLoad(builder.getPtrTy(), exceptionStorage)); + builder.CreateLoad(exceptionStorage)); builder.CreateRetVoid(); // Normal Block @@ -1206,8 +1206,7 @@ static llvm::Function *createCatchWrappedInvokeFunction( builder.SetInsertPoint(unwindResumeBlock); - builder.CreateResume( - builder.CreateLoad(ourCaughtResultType, caughtResultStorage)); + builder.CreateResume(builder.CreateLoad(caughtResultStorage)); // Exception Block @@ -1242,9 +1241,8 @@ static llvm::Function *createCatchWrappedInvokeFunction( // Retrieve exception_class member from thrown exception // (_Unwind_Exception instance). This member tells us whether or not // the exception is foreign. 
- llvm::Value *unwindExceptionClass = builder.CreateLoad( - builder.getInt64Ty(), - builder.CreateStructGEP( + llvm::Value *unwindExceptionClass = + builder.CreateLoad(builder.CreateStructGEP( ourUnwindExceptionType, builder.CreatePointerCast(unwindException, ourUnwindExceptionType->getPointerTo()), @@ -1280,9 +1278,9 @@ static llvm::Function *createCatchWrappedInvokeFunction( // // Note: ourBaseFromUnwindOffset is usually negative llvm::Value *typeInfoThrown = builder.CreatePointerCast( - builder.CreateConstGEP1_64(builder.getPtrTy(), unwindException, - ourBaseFromUnwindOffset), - ourExceptionType->getPointerTo()); + builder.CreateConstGEP1_64(unwindException, + ourBaseFromUnwindOffset), + ourExceptionType->getPointerTo()); // Retrieve thrown exception type info type // @@ -1291,15 +1289,17 @@ static llvm::Function *createCatchWrappedInvokeFunction( typeInfoThrown = builder.CreateStructGEP(ourExceptionType, typeInfoThrown, 0); llvm::Value *typeInfoThrownType = - builder.CreateStructGEP(ourTypeInfoType, typeInfoThrown, 0); + builder.CreateStructGEP(builder.getPtrTy(), typeInfoThrown, 0); - llvm::Value *ti8 = - builder.CreateLoad(builder.getInt8Ty(), typeInfoThrownType); - generateIntegerPrint(context, module, builder, *toPrint32Int, - builder.CreateZExt(ti8, builder.getInt32Ty()), + generateIntegerPrint(context, + module, + builder, + *toPrint32Int, + *(builder.CreateLoad(typeInfoThrownType)), "Gen: Exception type <%d> received (stack unwound) " " in " + - ourId + ".\n", + ourId + + ".\n", USE_GLOBAL_STR_CONSTS); // Route to matched type info catch block or run cleanup finally block @@ -1311,7 +1311,8 @@ static llvm::Function *createCatchWrappedInvokeFunction( for (unsigned i = 1; i <= numExceptionsToCatch; ++i) { nextTypeToCatch = i - 1; - switchToCatchBlock->addCase(llvm::ConstantInt::get(builder.getInt32Ty(), i), + switchToCatchBlock->addCase(llvm::ConstantInt::get( + llvm::Type::getInt32Ty(context), i), catchBlocks[nextTypeToCatch]); } @@ -1386,10 +1387,14 @@ createThrowExceptionFunction(llvm::Module &module, llvm::IRBuilder<> &builder, builder.SetInsertPoint(entryBlock); llvm::Function *toPrint32Int = module.getFunction("print32Int"); - generateIntegerPrint(context, module, builder, *toPrint32Int, - builder.CreateZExt(exceptionType, builder.getInt32Ty()), - "\nGen: About to throw exception type <%d> in " + ourId + - ".\n", + generateIntegerPrint(context, + module, + builder, + *toPrint32Int, + *exceptionType, + "\nGen: About to throw exception type <%d> in " + + ourId + + ".\n", USE_GLOBAL_STR_CONSTS); // Switches on runtime type info type value to determine whether or not @@ -1541,13 +1546,15 @@ typedef void (*OurExceptionThrowFunctType) (int32_t typeToThrow); /// @param function generated test function to run /// @param typeToThrow type info type of generated exception to throw, or /// indicator to cause foreign exception to be thrown. 
-static void runExceptionThrow(llvm::orc::LLJIT *JIT, std::string function, - int32_t typeToThrow) { +static +void runExceptionThrow(llvm::ExecutionEngine *engine, + llvm::Function *function, + int32_t typeToThrow) { // Find test's function pointer OurExceptionThrowFunctType functPtr = - reinterpret_cast(reinterpret_cast( - ExitOnErr(JIT->lookup(function)).getValue())); + reinterpret_cast( + reinterpret_cast(engine->getPointerToFunction(function))); try { // Run test @@ -1576,6 +1583,8 @@ static void runExceptionThrow(llvm::orc::LLJIT *JIT, std::string function, // End test functions // +typedef llvm::ArrayRef TypeArray; + /// This initialization routine creates type info globals and /// adds external function declarations to module. /// @param numTypeInfos number of linear type info associated type info types @@ -1885,73 +1894,93 @@ int main(int argc, char *argv[]) { return(0); } + // If not set, exception handling will not be turned on + llvm::TargetOptions Opts; + llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); - auto Context = std::make_unique(); - llvm::IRBuilder<> theBuilder(*Context); + llvm::LLVMContext Context; + llvm::IRBuilder<> theBuilder(Context); // Make the module, which holds all the code. std::unique_ptr Owner = - std::make_unique("my cool jit", *Context); + std::make_unique("my cool jit", Context); llvm::Module *module = Owner.get(); - // Build LLJIT - std::unique_ptr JIT = - ExitOnErr(llvm::orc::LLJITBuilder().create()); + std::unique_ptr MemMgr(new llvm::SectionMemoryManager()); - // Set up the optimizer pipeline. - llvm::legacy::FunctionPassManager fpm(module); + // Build engine with JIT + llvm::EngineBuilder factory(std::move(Owner)); + factory.setEngineKind(llvm::EngineKind::JIT); + factory.setTargetOptions(Opts); + factory.setMCJITMemoryManager(std::move(MemMgr)); + llvm::ExecutionEngine *executionEngine = factory.create(); - // Optimizations turned on + { + llvm::legacy::FunctionPassManager fpm(module); + + // Set up the optimizer pipeline. + // Start with registering info about how the + // target lays out data structures. + module->setDataLayout(executionEngine->getDataLayout()); + + // Optimizations turned on #ifdef ADD_OPT_PASSES - // Basic AliasAnslysis support for GVN. - fpm.add(llvm::createBasicAliasAnalysisPass()); + // Basic AliasAnslysis support for GVN. + fpm.add(llvm::createBasicAliasAnalysisPass()); - // Promote allocas to registers. - fpm.add(llvm::createPromoteMemoryToRegisterPass()); + // Promote allocas to registers. + fpm.add(llvm::createPromoteMemoryToRegisterPass()); - // Do simple "peephole" optimizations and bit-twiddling optzns. - fpm.add(llvm::createInstructionCombiningPass()); + // Do simple "peephole" optimizations and bit-twiddling optzns. + fpm.add(llvm::createInstructionCombiningPass()); - // Reassociate expressions. - fpm.add(llvm::createReassociatePass()); + // Reassociate expressions. + fpm.add(llvm::createReassociatePass()); - // Eliminate Common SubExpressions. - fpm.add(llvm::createGVNPass()); + // Eliminate Common SubExpressions. + fpm.add(llvm::createGVNPass()); - // Simplify the control flow graph (deleting unreachable - // blocks, etc). - fpm.add(llvm::createCFGSimplificationPass()); + // Simplify the control flow graph (deleting unreachable + // blocks, etc). + fpm.add(llvm::createCFGSimplificationPass()); #endif // ADD_OPT_PASSES - fpm.doInitialization(); + fpm.doInitialization(); - // Generate test code using function throwCppException(...) as - // the function which throws foreign exceptions. 
- createUnwindExceptionTest(*module, theBuilder, fpm, "throwCppException"); + // Generate test code using function throwCppException(...) as + // the function which throws foreign exceptions. + llvm::Function *toRun = + createUnwindExceptionTest(*module, + theBuilder, + fpm, + "throwCppException"); - ExitOnErr(JIT->addIRModule( - llvm::orc::ThreadSafeModule(std::move(Owner), std::move(Context)))); + executionEngine->finalizeObject(); #ifndef NDEBUG - fprintf(stderr, "\nBegin module dump:\n\n"); + fprintf(stderr, "\nBegin module dump:\n\n"); - module->print(llvm::errs(), nullptr); + module->dump(); - fprintf(stderr, "\nEnd module dump:\n"); + fprintf(stderr, "\nEnd module dump:\n"); #endif - fprintf(stderr, "\n\nBegin Test:\n"); - std::string toRun = "outerCatchFunct"; + fprintf(stderr, "\n\nBegin Test:\n"); + + for (int i = 1; i < argc; ++i) { + // Run test for each argument whose value is the exception + // type to throw. + runExceptionThrow(executionEngine, + toRun, + (unsigned) strtoul(argv[i], NULL, 10)); + } - for (int i = 1; i < argc; ++i) { - // Run test for each argument whose value is the exception - // type to throw. - runExceptionThrow(JIT.get(), toRun, (unsigned)strtoul(argv[i], NULL, 10)); + fprintf(stderr, "\nEnd Test:\n\n"); } - fprintf(stderr, "\nEnd Test:\n\n"); + delete executionEngine; return 0; } diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h index 3f7f8c7838fd16..efc8db12a69720 100644 --- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h +++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h @@ -246,20 +246,17 @@ class LVObject { virtual void setName(StringRef ObjectName) {} LVElement *getParent() const { - assert((!Parent.Element || - (Parent.Element && static_cast(Parent.Element))) && + assert((!Parent.Element || static_cast(Parent.Element)) && "Invalid element"); return Parent.Element; } LVScope *getParentScope() const { - assert((!Parent.Scope || - (Parent.Scope && static_cast(Parent.Scope))) && + assert((!Parent.Scope || static_cast(Parent.Scope)) && "Invalid scope"); return Parent.Scope; } LVSymbol *getParentSymbol() const { - assert((!Parent.Symbol || - (Parent.Symbol && static_cast(Parent.Symbol))) && + assert((!Parent.Symbol || static_cast(Parent.Symbol)) && "Invalid symbol"); return Parent.Symbol; } diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h index 9dcb115a0c514b..f6ee963bd88559 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h @@ -343,13 +343,31 @@ template void ConstructCompositionT::mergeDSA() { } } - // Check reductions as well, clear "shared" if set. + // Check other privatizing clauses as well, clear "shared" if set. 
+ for (auto &clause : clauseSets[llvm::omp::Clause::OMPC_in_reduction]) { + using InReductionTy = tomp::clause::InReductionT; + using ListTy = typename InReductionTy::List; + for (auto &object : std::get(std::get(clause.u).t)) + getDsa(object).second &= ~DSA::Shared; + } + for (auto &clause : clauseSets[llvm::omp::Clause::OMPC_linear]) { + using LinearTy = tomp::clause::LinearT; + using ListTy = typename LinearTy::List; + for (auto &object : std::get(std::get(clause.u).t)) + getDsa(object).second &= ~DSA::Shared; + } for (auto &clause : clauseSets[llvm::omp::Clause::OMPC_reduction]) { using ReductionTy = tomp::clause::ReductionT; using ListTy = typename ReductionTy::List; for (auto &object : std::get(std::get(clause.u).t)) getDsa(object).second &= ~DSA::Shared; } + for (auto &clause : clauseSets[llvm::omp::Clause::OMPC_task_reduction]) { + using TaskReductionTy = tomp::clause::TaskReductionT; + using ListTy = typename TaskReductionTy::List; + for (auto &object : std::get(std::get(clause.u).t)) + getDsa(object).second &= ~DSA::Shared; + } tomp::ListT privateObj, sharedObj, firstpObj, lastpObj, lastpcObj; for (auto &[object, dsa] : objectDsa) { diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h index 5f12c62b832fc4..3fa27608ead948 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h @@ -793,9 +793,14 @@ bool ConstructDecompositionT::applyClause( // [5.2:340:33] auto canMakePrivateCopy = [](llvm::omp::Clause id) { switch (id) { + // Clauses with "privatization" property: case llvm::omp::Clause::OMPC_firstprivate: + case llvm::omp::Clause::OMPC_in_reduction: case llvm::omp::Clause::OMPC_lastprivate: + case llvm::omp::Clause::OMPC_linear: case llvm::omp::Clause::OMPC_private: + case llvm::omp::Clause::OMPC_reduction: + case llvm::omp::Clause::OMPC_task_reduction: return true; default: return false; @@ -930,7 +935,8 @@ bool ConstructDecompositionT::applyClause( // Apply clause without modifier. leaf.clauses.push_back(unmodified); } - applied = true; + // The modifier must be applied to some construct. + applied = effectiveApplied; } if (!applied) diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index ea73808f33ff12..6e67fb81a72a1e 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1723,7 +1723,7 @@ def int_coro_await_suspend_bool : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_ptr_ty, llvm_ptr_ty], [Throws]>; -def int_coro_await_suspend_handle : Intrinsic<[llvm_ptr_ty], +def int_coro_await_suspend_handle : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_ptr_ty], [Throws]>; diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index 428e2d7741a412..b5e62ac3c187f2 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -245,8 +245,6 @@ class MCStreamer { /// requires. unsigned NextWinCFIID = 0; - bool UseAssemblerInfoForParsing; - /// Is the assembler allowed to insert padding automatically? For /// correctness reasons, we sometimes need to ensure instructions aren't /// separated in unexpected ways. At the moment, this feature is only @@ -296,11 +294,10 @@ class MCStreamer { MCContext &getContext() const { return Context; } + // MCObjectStreamer has an MCAssembler and allows more expression folding at + // parse time. 
virtual MCAssembler *getAssemblerPtr() { return nullptr; } - void setUseAssemblerInfoForParsing(bool v) { UseAssemblerInfoForParsing = v; } - bool getUseAssemblerInfoForParsing() { return UseAssemblerInfoForParsing; } - MCTargetStreamer *getTargetStreamer() { return TargetStreamer.get(); } diff --git a/llvm/include/llvm/Support/Error.h b/llvm/include/llvm/Support/Error.h index 894b6484336aef..217130ce293af9 100644 --- a/llvm/include/llvm/Support/Error.h +++ b/llvm/include/llvm/Support/Error.h @@ -493,7 +493,7 @@ template class [[nodiscard]] Expected { public: /// Create an Expected error value from the given Error. - Expected(Error Err) + Expected(Error &&Err) : HasError(true) #if LLVM_ENABLE_ABI_BREAKING_CHECKS // Expected is unchecked upon construction in Debug builds. diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index e46b9c67aa86e9..9d3c607471a6e3 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -765,8 +765,8 @@ class Instruction : InstructionEncoding { /// Should generate helper functions that help you to map a logical operand's /// index to the underlying MIOperand's index. - /// In most architectures logical operand indicies are equal to - /// MIOperand indicies, but for some CISC architectures, a logical operand + /// In most architectures logical operand indices are equal to + /// MIOperand indices, but for some CISC architectures, a logical operand /// might be consist of multiple MIOperand (e.g. a logical operand that /// uses complex address mode). bit UseLogicalOperandMappings = false; diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 1bce9aae09bb26..e0e7dd18cd8d48 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -3444,9 +3444,9 @@ bool DependenceInfo::tryDelinearizeFixedSize( // iff the subscripts are positive and are less than the range of the // dimension. 
if (!DisableDelinearizationChecks) { - auto AllIndiciesInRange = [&](SmallVector &DimensionSizes, - SmallVectorImpl &Subscripts, - Value *Ptr) { + auto AllIndicesInRange = [&](SmallVector &DimensionSizes, + SmallVectorImpl &Subscripts, + Value *Ptr) { size_t SSize = Subscripts.size(); for (size_t I = 1; I < SSize; ++I) { const SCEV *S = Subscripts[I]; @@ -3462,8 +3462,8 @@ bool DependenceInfo::tryDelinearizeFixedSize( return true; }; - if (!AllIndiciesInRange(SrcSizes, SrcSubscripts, SrcPtr) || - !AllIndiciesInRange(DstSizes, DstSubscripts, DstPtr)) { + if (!AllIndicesInRange(SrcSizes, SrcSubscripts, SrcPtr) || + !AllIndicesInRange(DstSizes, DstSubscripts, DstPtr)) { SrcSubscripts.clear(); DstSubscripts.clear(); return false; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 3cbb760c7811ae..592b67fdcd1b13 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -306,7 +306,8 @@ static Expected hasObjCCategoryInModule(BitstreamCursor &Stream) { return error("Invalid section name record"); // Check for the i386 and other (x86_64, ARM) conventions if (S.find("__DATA,__objc_catlist") != std::string::npos || - S.find("__OBJC,__category") != std::string::npos) + S.find("__OBJC,__category") != std::string::npos || + S.find("__TEXT,__swift") != std::string::npos) return true; break; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index aff09c4eafd893..1a210ac5440006 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -994,7 +994,7 @@ void ModuleBitcodeWriter::writeTypeTable() { Stream.EnterSubblock(bitc::TYPE_BLOCK_ID_NEW, 4 /*count from # abbrevs */); SmallVector TypeVals; - uint64_t NumBits = VE.computeBitsRequiredForTypeIndicies(); + uint64_t NumBits = VE.computeBitsRequiredForTypeIndices(); // Abbrev for TYPE_CODE_OPAQUE_POINTER. 
auto Abbv = std::make_shared(); @@ -3810,7 +3810,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_SETTYPE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - VE.computeBitsRequiredForTypeIndicies())); + VE.computeBitsRequiredForTypeIndices())); if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) != CONSTANTS_SETTYPE_ABBREV) llvm_unreachable("Unexpected abbrev ordering!"); @@ -3830,7 +3830,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // cast opc Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // typeid - VE.computeBitsRequiredForTypeIndicies())); + VE.computeBitsRequiredForTypeIndices())); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) != @@ -3852,7 +3852,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_LOAD)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Ptr Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty - VE.computeBitsRequiredForTypeIndicies())); + VE.computeBitsRequiredForTypeIndices())); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // Align Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // volatile if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != @@ -3904,7 +3904,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty - VE.computeBitsRequiredForTypeIndicies())); + VE.computeBitsRequiredForTypeIndices())); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != FUNCTION_INST_CAST_ABBREV) @@ -3915,7 +3915,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty - VE.computeBitsRequiredForTypeIndicies())); + VE.computeBitsRequiredForTypeIndices())); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); // flags if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp index 44163a087e3061..811d33d68776e3 100644 --- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -1211,6 +1211,6 @@ unsigned ValueEnumerator::getGlobalBasicBlockID(const BasicBlock *BB) const { return getGlobalBasicBlockID(BB); } -uint64_t ValueEnumerator::computeBitsRequiredForTypeIndicies() const { +uint64_t ValueEnumerator::computeBitsRequiredForTypeIndices() const { return Log2_32_Ceil(getTypes().size() + 1); } diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.h b/llvm/lib/Bitcode/Writer/ValueEnumerator.h index 4b45503595f6f4..8348d6728a5c31 100644 --- a/llvm/lib/Bitcode/Writer/ValueEnumerator.h +++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.h @@ -234,7 +234,7 @@ class ValueEnumerator { void incorporateFunction(const Function &F); void purgeFunction(); - uint64_t computeBitsRequiredForTypeIndicies() const; + uint64_t computeBitsRequiredForTypeIndices() const; private: void OptimizeConstants(unsigned CstStart, 
unsigned CstEnd); diff --git a/llvm/lib/Bitstream/Reader/BitstreamReader.cpp b/llvm/lib/Bitstream/Reader/BitstreamReader.cpp index 3cc9dfdf7b8586..5b2c76350029be 100644 --- a/llvm/lib/Bitstream/Reader/BitstreamReader.cpp +++ b/llvm/lib/Bitstream/Reader/BitstreamReader.cpp @@ -167,7 +167,7 @@ Expected BitstreamCursor::skipRecord(unsigned AbbrevID) { if (Error Err = JumpToBit(GetCurrentBitNo() + static_cast(NumElts) * EltEnc.getEncodingData())) - return std::move(Err); + return Err; break; case BitCodeAbbrevOp::VBR: assert((unsigned)EltEnc.getEncodingData() <= MaxChunkSize); @@ -180,7 +180,7 @@ Expected BitstreamCursor::skipRecord(unsigned AbbrevID) { break; case BitCodeAbbrevOp::Char6: if (Error Err = JumpToBit(GetCurrentBitNo() + NumElts * 6)) - return std::move(Err); + return Err; break; } continue; @@ -206,7 +206,7 @@ Expected BitstreamCursor::skipRecord(unsigned AbbrevID) { // Skip over the blob. if (Error Err = JumpToBit(NewEnd)) - return std::move(Err); + return Err; } return Code; } @@ -344,7 +344,7 @@ Expected BitstreamCursor::readRecord(unsigned AbbrevID, // over tail padding first, in case jumping to NewEnd invalidates the Blob // pointer. if (Error Err = JumpToBit(NewEnd)) - return std::move(Err); + return Err; const char *Ptr = (const char *)getPointerToBit(CurBitPos, NumElts); // If we can return a reference to the data, do so to avoid copying it. @@ -421,7 +421,7 @@ Error BitstreamCursor::ReadAbbrevRecord() { Expected> BitstreamCursor::ReadBlockInfoBlock(bool ReadBlockInfoNames) { if (llvm::Error Err = EnterSubBlock(bitc::BLOCKINFO_BLOCK_ID)) - return std::move(Err); + return Err; BitstreamBlockInfo NewBlockInfo; @@ -452,7 +452,7 @@ BitstreamCursor::ReadBlockInfoBlock(bool ReadBlockInfoNames) { if (!CurBlockInfo) return std::nullopt; if (Error Err = ReadAbbrevRecord()) - return std::move(Err); + return Err; // ReadAbbrevRecord installs the abbrev in CurAbbrevs. Move it to the // appropriate BlockInfo. diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index d0ef3e5a19391c..08e3c208ba4d38 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -102,9 +102,6 @@ void AsmPrinter::emitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, std::unique_ptr Parser( createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum)); - // Do not use assembler-level information for parsing inline assembly. - OutStreamer->setUseAssemblerInfoForParsing(false); - // We create a new MCInstrInfo here since we might be at the module level // and not have a MachineFunction to initialize the TargetInstrInfo from and // we only need MCInstrInfo for asm parsing. We create one unconditionally diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index bf730be00a9a92..e146fb7e576819 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -86,7 +86,7 @@ /// lookup the VarLoc in the VarLocMap. Rather than operate directly on machine /// locations, the dataflow analysis in this pass identifies locations by their /// indices in the VarLocMap, meaning all the variable locations in a block can -/// be described by a sparse vector of VarLocMap indicies. +/// be described by a sparse vector of VarLocMap indices. /// /// All the storage for the dataflow analysis is local to the ExtendRanges /// method and passed down to helper methods. 
"OutLocs" and "InLocs" record the diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index 114e7910dc27bb..f3a961f883517f 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -212,7 +212,7 @@ static const std::vector PerLiveRangeShape{1, NumberOfInterferences}; M(float, mbb_frequencies, MBBFrequencyShape, \ "A vector of machine basic block frequencies") \ M(int64_t, mbb_mapping, InstructionsShape, \ - "A vector of indicies mapping instructions to MBBs") + "A vector of indices mapping instructions to MBBs") #else #define RA_EVICT_FIRST_DEVELOPMENT_FEATURE(M) #define RA_EVICT_REST_DEVELOPMENT_FEATURES(M) diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index dd56c0ae92fb8a..fccb536b568691 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -1486,7 +1486,7 @@ bool PEI::replaceFrameIndexDebugInstr(MachineFunction &MF, MachineInstr &MI, // pointer as the base register. if (MI.getOpcode() == TargetOpcode::STATEPOINT) { assert((!MI.isDebugValue() || OpIdx == 0) && - "Frame indicies can only appear as the first operand of a " + "Frame indices can only appear as the first operand of a " "DBG_VALUE machine instruction"); Register Reg; MachineOperand &Offset = MI.getOperand(OpIdx + 1); diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index d2da5d0d3f90f2..a9003a164b306d 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -40,14 +40,7 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context, MCObjectStreamer::~MCObjectStreamer() = default; -// AssemblerPtr is used for evaluation of expressions and causes -// difference between asm and object outputs. Return nullptr to in -// inline asm mode to limit divergence to assembly inputs. 
-MCAssembler *MCObjectStreamer::getAssemblerPtr() { - if (getUseAssemblerInfoForParsing()) - return Assembler.get(); - return nullptr; -} +MCAssembler *MCObjectStreamer::getAssemblerPtr() { return Assembler.get(); } void MCObjectStreamer::addPendingLabel(MCSymbol* S) { MCSection *CurSection = getCurrentSectionOnly(); diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index 062b6b41ac4aa1..80c1ce7b6def22 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -93,7 +93,7 @@ void MCTargetStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {} MCStreamer::MCStreamer(MCContext &Ctx) : Context(Ctx), CurrentWinFrameInfo(nullptr), - CurrentProcWinFrameInfoStartIndex(0), UseAssemblerInfoForParsing(false) { + CurrentProcWinFrameInfoStartIndex(0) { SectionStack.push_back(std::pair()); } diff --git a/llvm/lib/Object/COFFObjectFile.cpp b/llvm/lib/Object/COFFObjectFile.cpp index 18506f39f6b57b..5a85b8e00c633a 100644 --- a/llvm/lib/Object/COFFObjectFile.cpp +++ b/llvm/lib/Object/COFFObjectFile.cpp @@ -294,7 +294,7 @@ COFFObjectFile::getSectionContents(DataRefImpl Ref) const { const coff_section *Sec = toSec(Ref); ArrayRef Res; if (Error E = getSectionContents(Sec, Res)) - return std::move(E); + return E; return Res; } @@ -807,7 +807,7 @@ Expected> COFFObjectFile::create(MemoryBufferRef Object) { std::unique_ptr Obj(new COFFObjectFile(std::move(Object))); if (Error E = Obj->initialize()) - return std::move(E); + return E; return std::move(Obj); } @@ -1959,7 +1959,7 @@ ResourceSectionRef::getContents(const coff_resource_data_entry &Entry) { uint64_t Offset = Entry.DataRVA + Sym->getValue(); ArrayRef Contents; if (Error E = Obj->getSectionContents(*Section, Contents)) - return std::move(E); + return E; if (Offset + Entry.DataSize > Contents.size()) return createStringError(object_error::parse_failed, "data outside of section"); diff --git a/llvm/lib/Object/WindowsResource.cpp b/llvm/lib/Object/WindowsResource.cpp index 983c8e30a9420d..306e8ec542068b 100644 --- a/llvm/lib/Object/WindowsResource.cpp +++ b/llvm/lib/Object/WindowsResource.cpp @@ -80,7 +80,7 @@ Expected ResourceEntryRef::create(BinaryStreamRef BSR, const WindowsResource *Owner) { auto Ref = ResourceEntryRef(BSR, Owner); if (auto E = Ref.loadNext()) - return std::move(E); + return E; return Ref; } @@ -1006,7 +1006,7 @@ writeWindowsResourceCOFF(COFF::MachineTypes MachineType, Error E = Error::success(); WindowsResourceCOFFWriter Writer(MachineType, Parser, E); if (E) - return std::move(E); + return E; return Writer.write(TimeDateStamp); } diff --git a/llvm/lib/Support/ELFAttributeParser.cpp b/llvm/lib/Support/ELFAttributeParser.cpp index d3100c9ebb211b..26c3d54e17ade8 100644 --- a/llvm/lib/Support/ELFAttributeParser.cpp +++ b/llvm/lib/Support/ELFAttributeParser.cpp @@ -154,7 +154,7 @@ Error ELFAttributeParser::parseSubsection(uint32_t length) { Twine::utohexstr(cursor.tell() - 5)); StringRef scopeName, indexName; - SmallVector indicies; + SmallVector indices; switch (tag) { case ELFAttrs::File: scopeName = "FileAttributes"; @@ -162,12 +162,12 @@ Error ELFAttributeParser::parseSubsection(uint32_t length) { case ELFAttrs::Section: scopeName = "SectionAttributes"; indexName = "Sections"; - parseIndexList(indicies); + parseIndexList(indices); break; case ELFAttrs::Symbol: scopeName = "SymbolAttributes"; indexName = "Symbols"; - parseIndexList(indicies); + parseIndexList(indices); break; default: return createStringError(errc::invalid_argument, @@ -178,8 +178,8 @@ Error 
ELFAttributeParser::parseSubsection(uint32_t length) { if (sw) { DictScope scope(*sw, scopeName); - if (!indicies.empty()) - sw->printList(indexName, indicies); + if (!indices.empty()) + sw->printList(indexName, indices); if (Error e = parseAttributeList(size - 5)) return e; } else if (Error e = parseAttributeList(size - 5)) diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index 4cc4fe019b75b1..fb7e804fd7e843 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -79,8 +79,16 @@ void *operator new(size_t N, const NamedBufferAlloc &Alloc) { SmallString<256> NameBuf; StringRef NameRef = Alloc.Name.toStringRef(NameBuf); - char *Mem = static_cast(operator new(N + sizeof(size_t) + - NameRef.size() + 1)); + // We use malloc() and manually handle it returning null instead of calling + // operator new because we need all uses of NamedBufferAlloc to be + // deallocated with a call to free() due to needing to use malloc() in + // WritableMemoryBuffer::getNewUninitMemBuffer() to work around the out-of- + // memory handler installed by default in LLVM. See operator delete() member + // functions within this file for the paired call to free(). + char *Mem = + static_cast(std::malloc(N + sizeof(size_t) + NameRef.size() + 1)); + if (!Mem) + llvm::report_bad_alloc_error("Allocation failed"); *reinterpret_cast(Mem + N) = NameRef.size(); CopyStringRef(Mem + N + sizeof(size_t), NameRef); return Mem; @@ -98,7 +106,7 @@ class MemoryBufferMem : public MB { /// Disable sized deallocation for MemoryBufferMem, because it has /// tail-allocated data. - void operator delete(void *p) { ::operator delete(p); } + void operator delete(void *p) { std::free(p); } StringRef getBufferIdentifier() const override { // The name is stored after the class itself. @@ -225,7 +233,7 @@ class MemoryBufferMMapFile : public MB { /// Disable sized deallocation for MemoryBufferMMapFile, because it has /// tail-allocated data. - void operator delete(void *p) { ::operator delete(p); } + void operator delete(void *p) { std::free(p); } StringRef getBufferIdentifier() const override { // The name is stored after the class itself. @@ -315,7 +323,14 @@ WritableMemoryBuffer::getNewUninitMemBuffer(size_t Size, size_t RealLen = StringLen + Size + 1 + BufAlign.value(); if (RealLen <= Size) // Check for rollover. return nullptr; - char *Mem = static_cast(operator new(RealLen, std::nothrow)); + // We use a call to malloc() rather than a call to a non-throwing operator + // new() because LLVM unconditionally installs an out of memory new handler + // when exceptions are disabled. This new handler intentionally crashes to + // aid with debugging, but that makes non-throwing new calls unhelpful. + // See MemoryBufferMem::operator delete() for the paired call to free(), and + // llvm::install_out_of_memory_new_handler() for the installation of the + // custom new handler. + char *Mem = static_cast(std::malloc(RealLen)); if (!Mem) return nullptr; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index afa023220d357b..6223c211b33b68 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -23881,7 +23881,7 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, // For "scalar + vector of indices", just scale the indices. This only // applies to non-temporal scatters because there's no instruction that takes - // indicies. + // indices. 
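The MemoryBuffer.cpp hunks above explain the switch to std::malloc: when exceptions are disabled, LLVM installs an out-of-memory new handler that intentionally crashes, which defeats a non-throwing operator new, and malloc'ed storage must then be released by a matching std::free in the overridden operator delete. A standalone sketch of that pairing rule; MallocBacked is a hypothetical type standing in for the tail-allocated buffer classes:

#include <cstdio>
#include <cstdlib>
#include <new>

struct MallocBacked {
  // Storage comes from std::malloc, bypassing any installed new handler.
  void *operator new(std::size_t N) {
    if (void *Mem = std::malloc(N))
      return Mem;
    std::fputs("allocation failed\n", stderr); // stand-in for report_bad_alloc_error
    std::abort();
  }
  // Because the storage came from malloc, the paired deallocation is free.
  void operator delete(void *P) { std::free(P); }
  int Payload = 0;
};

int main() {
  MallocBacked *Obj = new MallocBacked(); // allocated with std::malloc
  Obj->Payload = 1;
  delete Obj;                             // released with the paired std::free
  return 0;
}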
if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) { Offset = getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index f80f424a584719..4af2e1b32e6b99 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -517,12 +517,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { DumpCodeInstEmitter = nullptr; if (STM.dumpCode()) { - // For -dumpcode, get the assembler out of the streamer, even if it does - // not really want to let us have it. This only works with -filetype=obj. - bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing(); - OutStreamer->setUseAssemblerInfoForParsing(true); + // For -dumpcode, get the assembler out of the streamer. This only works + // with -filetype=obj. MCAssembler *Assembler = OutStreamer->getAssemblerPtr(); - OutStreamer->setUseAssemblerInfoForParsing(SaveFlag); if (Assembler) DumpCodeInstEmitter = Assembler->getEmitterPtr(); } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index d47a5f8ebb8157..c08c35c459843c 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1388,9 +1388,6 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool IsAtomic); public: - enum AMDGPUMatchResultTy { - Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY - }; enum OperandMode { OperandMode_Default, OperandMode_NSA, @@ -5262,15 +5259,11 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, Variant); // We order match statuses from least to most specific. We use most specific // status as resulting - // Match_MnemonicFail < Match_InvalidOperand < Match_MissingFeature < Match_PreferE32 - if ((R == Match_Success) || - (R == Match_PreferE32) || - (R == Match_MissingFeature && Result != Match_PreferE32) || - (R == Match_InvalidOperand && Result != Match_MissingFeature - && Result != Match_PreferE32) || - (R == Match_MnemonicFail && Result != Match_InvalidOperand - && Result != Match_MissingFeature - && Result != Match_PreferE32)) { + // Match_MnemonicFail < Match_InvalidOperand < Match_MissingFeature + if (R == Match_Success || R == Match_MissingFeature || + (R == Match_InvalidOperand && Result != Match_MissingFeature) || + (R == Match_MnemonicFail && Result != Match_InvalidOperand && + Result != Match_MissingFeature)) { Result = R; ErrorInfo = EI; } @@ -5316,9 +5309,6 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(ErrorLoc, "invalid operand for instruction"); } - case Match_PreferE32: - return Error(IDLoc, "internal error: instruction without _e64 suffix " - "should be encoded as e32"); case Match_MnemonicFail: llvm_unreachable("Invalid instructions should have been handled already"); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8f741ffc58a827..89e83babcfef42 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7053,7 +7053,7 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, DAG.getSplatBuildVector(VecVT, SL, InsVal)); - // 2. Mask off all other indicies except the required index within (1). + // 2. Mask off all other indices except the required index within (1). 
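The numbered comments in the SIISelLowering hunk here describe lowering INSERT_VECTOR_ELT with plain integer arithmetic: splat the inserted value, keep only the target lane of the splat, clear that lane in the source, and OR the halves together. A standalone sketch of the same sequence on an 8 x i8 "vector" packed into a uint64_t; the lane values and index are made up for illustration:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Vec = 0x1122334455667788ull; // source vector, one byte per lane
  unsigned Idx = 2;                     // lane to insert into, counted from the LSB
  uint8_t Ins = 0xAA;                   // value to insert

  uint64_t BFM = 0xFFull << (Idx * 8);          // mask selecting the target lane
  uint64_t Splat = 0x0101010101010101ull * Ins; // 1. splat of the inserted value
  uint64_t LHS = BFM & Splat;                   // 2. splat restricted to the lane
  uint64_t RHS = ~BFM & Vec;                    // 3. source with that lane cleared
  assert((LHS | RHS) == 0x1122334455AA7788ull); // lane 2 now holds 0xAA
  return 0;
}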
SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal); // 3. Mask off the required index within the target vector. diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index ebb269c6e6e067..87297ac86b9d25 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -958,7 +958,7 @@ void DXILBitcodeWriter::writeTypeTable() { Stream.EnterSubblock(bitc::TYPE_BLOCK_ID_NEW, 4 /*count from # abbrevs */); SmallVector TypeVals; - uint64_t NumBits = VE.computeBitsRequiredForTypeIndicies(); + uint64_t NumBits = VE.computeBitsRequiredForTypeIndices(); // Abbrev for TYPE_CODE_POINTER. auto Abbv = std::make_shared(); @@ -2747,7 +2747,7 @@ void DXILBitcodeWriter::writeBlockInfo() { auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_SETTYPE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - VE.computeBitsRequiredForTypeIndicies())); + VE.computeBitsRequiredForTypeIndices())); if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, std::move(Abbv)) != CONSTANTS_SETTYPE_ABBREV) assert(false && "Unexpected abbrev ordering!"); @@ -2767,7 +2767,7 @@ void DXILBitcodeWriter::writeBlockInfo() { Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // cast opc Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // typeid - VE.computeBitsRequiredForTypeIndicies())); + VE.computeBitsRequiredForTypeIndices())); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, std::move(Abbv)) != @@ -2789,7 +2789,7 @@ void DXILBitcodeWriter::writeBlockInfo() { Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_LOAD)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Ptr Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty - VE.computeBitsRequiredForTypeIndicies())); + VE.computeBitsRequiredForTypeIndices())); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // Align Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // volatile if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, std::move(Abbv)) != @@ -2822,7 +2822,7 @@ void DXILBitcodeWriter::writeBlockInfo() { Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty - VE.computeBitsRequiredForTypeIndicies())); + VE.computeBitsRequiredForTypeIndices())); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, std::move(Abbv)) != (unsigned)FUNCTION_INST_CAST_ABBREV) diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp index d90ab968c5d313..9a8d0afa629261 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp @@ -1140,6 +1140,6 @@ unsigned ValueEnumerator::getGlobalBasicBlockID(const BasicBlock *BB) const { return getGlobalBasicBlockID(BB); } -uint64_t ValueEnumerator::computeBitsRequiredForTypeIndicies() const { +uint64_t ValueEnumerator::computeBitsRequiredForTypeIndices() const { return Log2_32_Ceil(getTypes().size() + 1); } diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.h b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.h index 66a5d96080bc37..f0f91c6182e32e 100644 --- 
a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.h +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.h @@ -236,7 +236,7 @@ class ValueEnumerator { void incorporateFunction(const Function &F); void purgeFunction(); - uint64_t computeBitsRequiredForTypeIndicies() const; + uint64_t computeBitsRequiredForTypeIndices() const; void EnumerateType(Type *T); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 0a7483fc45b203..ad86c393ba7919 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -14945,7 +14945,7 @@ static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) { } } - // If the vector extract indicies are not correct, add the appropriate + // If the vector extract indices are not correct, add the appropriate // vector_shuffle. int TgtElemArrayIdx; int InputSize = Input.getValueType().getScalarSizeInBits(); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp index 0a304d4cb7d907..fca3362f9a8b27 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp @@ -114,29 +114,30 @@ static void generateInstSeqImpl(int64_t Val, const MCSubtargetInfo &STI, ShiftAmount = llvm::countr_zero((uint64_t)Val); Val >>= ShiftAmount; - // If the remaining bits don't fit in 12 bits, we might be able to reduce the - // shift amount in order to use LUI which will zero the lower 12 bits. + // If the remaining bits don't fit in 12 bits, we might be able to reduce + // the shift amount in order to use LUI which will zero the lower 12 bits. if (ShiftAmount > 12 && !isInt<12>(Val)) { - if (isInt<32>((uint64_t)Val << 12)) { - // Reduce the shift amount and add zeros to the LSBs so it will match LUI. + if (isInt<32>(Val << 12)) { + // Reduce the shift amount and add zeros to the LSBs so it will match + // LUI. ShiftAmount -= 12; - Val = (uint64_t)Val << 12; - } else if (isUInt<32>((uint64_t)Val << 12) && + Val = Val << 12; + } else if (isUInt<32>(Val << 12) && STI.hasFeature(RISCV::FeatureStdExtZba)) { // Reduce the shift amount and add zeros to the LSBs so it will match // LUI, then shift left with SLLI.UW to clear the upper 32 set bits. ShiftAmount -= 12; - Val = ((uint64_t)Val << 12) | (0xffffffffull << 32); + Val = SignExtend64<32>(Val << 12); Unsigned = true; } } // Try to use SLLI_UW for Val when it is uint32 but not int32. - if (isUInt<32>((uint64_t)Val) && !isInt<32>((uint64_t)Val) && + if (isUInt<32>(Val) && !isInt<32>(Val) && STI.hasFeature(RISCV::FeatureStdExtZba)) { - // Use LUI+ADDI or LUI to compose, then clear the upper 32 bits with + // Use LUI+ADDI(W) or LUI to compose, then clear the upper 32 bits with // SLLI_UW. - Val = ((uint64_t)Val) | (0xffffffffull << 32); + Val = SignExtend64<32>(Val); Unsigned = true; } } diff --git a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp index 5e6b7891449fec..7de48d8218f06e 100644 --- a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp +++ b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp @@ -72,7 +72,9 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { // are reserved for HINT instructions. 
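The RISCVMatInt hunks above replace hand-built "upper 32 bits all ones" patterns with SignExtend64<32>, treating the intermediate value as the sign-extended 32-bit result that LUI (optionally followed by SLLI.UW) actually produces on RV64. A standalone re-implementation of that helper's arithmetic, written here for illustration rather than taken from LLVM's MathExtras:

#include <cassert>
#include <cstdint>

// Takes the low B bits of X and sign-extends them to 64 bits. Assumes signed
// right shift is arithmetic, which holds for the compilers LLVM supports.
template <unsigned B> int64_t signExtend64(uint64_t X) {
  return int64_t(X << (64 - B)) >> (64 - B);
}

int main() {
  // Bit 31 set: the RV64 register value gets its upper 32 bits filled with ones.
  assert(signExtend64<32>(0x80000000u) == int64_t(0xffffffff80000000ull));
  // Bit 31 clear: the value is unchanged.
  assert(signExtend64<32>(0x7fffffffu) == 0x7fffffff);
  return 0;
}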
const MCInstrDesc &Desc = MI.getDesc(); if (!Desc.mayLoad() && !Desc.mayStore() && - !Desc.hasUnmodeledSideEffects()) + !Desc.hasUnmodeledSideEffects() && + MI.getOpcode() != RISCV::PseudoVSETVLI && + MI.getOpcode() != RISCV::PseudoVSETIVLI) continue; // For PseudoVSETVLIX0, Rd = X0 has special meaning. if (MI.getOpcode() == RISCV::PseudoVSETVLIX0) diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 1c815424bdfa62..363007d7b68b12 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -48,15 +48,13 @@ static cl::opt DisableInsertVSETVLPHIOpt( namespace { /// Given a virtual register \p Reg, return the corresponding VNInfo for it. -/// This should never return nullptr. +/// This will return nullptr if the virtual register is an implicit_def. static VNInfo *getVNInfoFromReg(Register Reg, const MachineInstr &MI, const LiveIntervals *LIS) { assert(Reg.isVirtual()); auto &LI = LIS->getInterval(Reg); SlotIndex SI = LIS->getSlotIndexes()->getInstructionIndex(MI); - VNInfo *VNI = LI.getVNInfoBefore(SI); - assert(VNI); - return VNI; + return LI.getVNInfoBefore(SI); } static unsigned getVLOpNum(const MachineInstr &MI) { @@ -894,8 +892,12 @@ static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI, "Can't handle X0, X0 vsetvli yet"); if (AVLReg == RISCV::X0) NewInfo.setAVLVLMAX(); - else - NewInfo.setAVLRegDef(getVNInfoFromReg(AVLReg, MI, LIS), AVLReg); + else if (VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS)) + NewInfo.setAVLRegDef(VNI, AVLReg); + else { + assert(MI.getOperand(1).isUndef()); + NewInfo.setAVLIgnored(); + } } NewInfo.setVTYPE(MI.getOperand(2).getImm()); @@ -966,9 +968,11 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags, } else InstrInfo.setAVLImm(Imm); + } else if (VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS)) { + InstrInfo.setAVLRegDef(VNI, VLOp.getReg()); } else { - InstrInfo.setAVLRegDef(getVNInfoFromReg(VLOp.getReg(), MI, LIS), - VLOp.getReg()); + assert(VLOp.isUndef()); + InstrInfo.setAVLIgnored(); } } else { assert(isScalarExtractInstr(MI)); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 4adc26f6289144..317a6d7d4c52f3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -6181,7 +6181,7 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 0, //===----------------------------------------------------------------------===// // Pseudos. -let hasSideEffects = 1, mayLoad = 0, mayStore = 0, Defs = [VL, VTYPE] in { +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Defs = [VL, VTYPE] in { // Due to rs1=X0 having special meaning, we need a GPRNoX0 register class for // the when we aren't using one of the special X0 encodings. Otherwise it could // be accidentally be made X0 by MachineIR optimizations. 
To satisfy the diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 2ebe5bdc47715b..ad015808604487 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -114,12 +114,9 @@ void SPIRVAsmPrinter::emitEndOfAsmFile(Module &M) { // Bound is an approximation that accounts for the maximum used register // number and number of generated OpLabels unsigned Bound = 2 * (ST->getBound() + 1) + NLabels; - bool FlagToRestore = OutStreamer->getUseAssemblerInfoForParsing(); - OutStreamer->setUseAssemblerInfoForParsing(true); if (MCAssembler *Asm = OutStreamer->getAssemblerPtr()) Asm->setBuildVersion(static_cast(0), Major, Minor, Bound, VersionTuple(Major, Minor, 0, Bound)); - OutStreamer->setUseAssemblerInfoForParsing(FlagToRestore); } void SPIRVAsmPrinter::emitFunctionHeader() { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a57c10e784d9c8..e7c70e3872ad13 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -22692,11 +22692,14 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); // Only promote the compare up to I32 if it is a 16 bit operation - // with an immediate. 16 bit immediates are to be avoided. + // with an immediate. 16 bit immediates are to be avoided unless the target + // isn't slowed down by length changing prefixes, we're optimizing for + // codesize or the comparison is with a folded load. if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() && + !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) && !DAG.getMachineFunction().getFunction().hasMinSize()) { - ConstantSDNode *COp0 = dyn_cast(Op0); - ConstantSDNode *COp1 = dyn_cast(Op1); + auto *COp0 = dyn_cast(Op0); + auto *COp1 = dyn_cast(Op1); // Don't do this if the immediate can fit in 8-bits. if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) || (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) { diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp index fd05e16ac1ceff..60c024556ff13f 100644 --- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp +++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp @@ -146,7 +146,7 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) { addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc)), TileSS) .addReg(SrcReg, getKillRegState(SrcMO.isKill())); MachineOperand &MO = NewMI->getOperand(2); - MO.setReg(GR64Cand); + MO.setReg(GR64Cand ? 
GR64Cand : X86::RAX); MO.setIsKill(true); // tileloadd (%sp, %idx), %tmm Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); @@ -157,7 +157,7 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) { // restore %rax // mov (%sp) %rax addFrameReference( - BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand), StrideSS); + BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), X86::RAX), StrideSS); } MI.eraseFromParent(); Changed = true; diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index 84fd88806154e3..5716fd0ea4ab96 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -47,7 +47,7 @@ struct LowererBase { ConstantPointerNull *const NullPtr; LowererBase(Module &M); - Value *makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt); + CallInst *makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt); }; enum class ABI { @@ -85,6 +85,7 @@ struct LLVM_LIBRARY_VISIBILITY Shape { SmallVector CoroSuspends; SmallVector SwiftErrorOps; SmallVector CoroAwaitSuspends; + SmallVector SymmetricTransfers; // Field indexes for special fields in the switch lowering. struct SwitchFieldIndex { diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 4eb6e75d09fa53..1d9cf185b75a71 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -113,21 +113,24 @@ class CoroCloner { /// ABIs. AnyCoroSuspendInst *ActiveSuspend = nullptr; + TargetTransformInfo &TTI; + public: /// Create a cloner for a switch lowering. CoroCloner(Function &OrigF, const Twine &Suffix, coro::Shape &Shape, - Kind FKind) + Kind FKind, TargetTransformInfo &TTI) : OrigF(OrigF), NewF(nullptr), Suffix(Suffix), Shape(Shape), FKind(FKind), - Builder(OrigF.getContext()) { + Builder(OrigF.getContext()), TTI(TTI) { assert(Shape.ABI == coro::ABI::Switch); } /// Create a cloner for a continuation lowering. CoroCloner(Function &OrigF, const Twine &Suffix, coro::Shape &Shape, - Function *NewF, AnyCoroSuspendInst *ActiveSuspend) + Function *NewF, AnyCoroSuspendInst *ActiveSuspend, + TargetTransformInfo &TTI) : OrigF(OrigF), NewF(NewF), Suffix(Suffix), Shape(Shape), FKind(Shape.ABI == coro::ABI::Async ? Kind::Async : Kind::Continuation), - Builder(OrigF.getContext()), ActiveSuspend(ActiveSuspend) { + Builder(OrigF.getContext()), ActiveSuspend(ActiveSuspend), TTI(TTI) { assert(Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce || Shape.ABI == coro::ABI::Async); assert(NewF && "need existing function for continuation"); @@ -171,7 +174,8 @@ class CoroCloner { // Lower the intrinisc in CoroEarly phase if coroutine frame doesn't escape // and it is known that other transformations, for example, sanitizers // won't lead to incorrect code. -static void lowerAwaitSuspend(IRBuilder<> &Builder, CoroAwaitSuspendInst *CB) { +static void lowerAwaitSuspend(IRBuilder<> &Builder, CoroAwaitSuspendInst *CB, + coro::Shape &Shape) { auto Wrapper = CB->getWrapperFunction(); auto Awaiter = CB->getAwaiter(); auto FramePtr = CB->getFrame(); @@ -206,6 +210,31 @@ static void lowerAwaitSuspend(IRBuilder<> &Builder, CoroAwaitSuspendInst *CB) { llvm_unreachable("Unexpected coro_await_suspend invocation method"); } + if (CB->getCalledFunction()->getIntrinsicID() == + Intrinsic::coro_await_suspend_handle) { + // Follow the lowered await_suspend call above with a lowered resume call + // to the returned coroutine. 
+ if (auto *Invoke = dyn_cast(CB)) { + // If the await_suspend call is an invoke, we continue in the next block. + Builder.SetInsertPoint(Invoke->getNormalDest()->getFirstInsertionPt()); + } + + coro::LowererBase LB(*Wrapper->getParent()); + auto *ResumeAddr = LB.makeSubFnCall(NewCall, CoroSubFnInst::ResumeIndex, + &*Builder.GetInsertPoint()); + + LLVMContext &Ctx = Builder.getContext(); + FunctionType *ResumeTy = FunctionType::get( + Type::getVoidTy(Ctx), PointerType::getUnqual(Ctx), false); + auto *ResumeCall = Builder.CreateCall(ResumeTy, ResumeAddr, {NewCall}); + + // We can't insert the 'ret' instruction and adjust the cc until the + // function has been split, so remember this for later. + Shape.SymmetricTransfers.push_back(ResumeCall); + + NewCall = ResumeCall; + } + CB->replaceAllUsesWith(NewCall); CB->eraseFromParent(); } @@ -213,7 +242,7 @@ static void lowerAwaitSuspend(IRBuilder<> &Builder, CoroAwaitSuspendInst *CB) { static void lowerAwaitSuspends(Function &F, coro::Shape &Shape) { IRBuilder<> Builder(F.getContext()); for (auto *AWS : Shape.CoroAwaitSuspends) - lowerAwaitSuspend(Builder, AWS); + lowerAwaitSuspend(Builder, AWS, Shape); } static void maybeFreeRetconStorage(IRBuilder<> &Builder, @@ -1056,6 +1085,25 @@ void CoroCloner::create() { // Set up the new entry block. replaceEntryBlock(); + // Turn symmetric transfers into musttail calls. + for (CallInst *ResumeCall : Shape.SymmetricTransfers) { + ResumeCall = cast(VMap[ResumeCall]); + ResumeCall->setCallingConv(NewF->getCallingConv()); + if (TTI.supportsTailCallFor(ResumeCall)) { + // FIXME: Could we support symmetric transfer effectively without + // musttail? + ResumeCall->setTailCallKind(CallInst::TCK_MustTail); + } + + // Put a 'ret void' after the call, and split any remaining instructions to + // an unreachable block. + BasicBlock *BB = ResumeCall->getParent(); + BB->splitBasicBlock(ResumeCall->getNextNode()); + Builder.SetInsertPoint(BB->getTerminator()); + Builder.CreateRetVoid(); + BB->getTerminator()->eraseFromParent(); + } + Builder.SetInsertPoint(&NewF->getEntryBlock().front()); NewFramePtr = deriveNewFramePointer(); @@ -1169,147 +1217,6 @@ static void postSplitCleanup(Function &F) { #endif } -// Assuming we arrived at the block NewBlock from Prev instruction, store -// PHI's incoming values in the ResolvedValues map. -static void -scanPHIsAndUpdateValueMap(Instruction *Prev, BasicBlock *NewBlock, - DenseMap &ResolvedValues) { - auto *PrevBB = Prev->getParent(); - for (PHINode &PN : NewBlock->phis()) { - auto V = PN.getIncomingValueForBlock(PrevBB); - // See if we already resolved it. - auto VI = ResolvedValues.find(V); - if (VI != ResolvedValues.end()) - V = VI->second; - // Remember the value. - ResolvedValues[&PN] = V; - } -} - -// Replace a sequence of branches leading to a ret, with a clone of a ret -// instruction. Suspend instruction represented by a switch, track the PHI -// values and select the correct case successor when possible. -static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) { - // There is nothing to simplify. 
- if (isa(InitialInst)) - return false; - - DenseMap ResolvedValues; - assert(InitialInst->getModule()); - const DataLayout &DL = InitialInst->getModule()->getDataLayout(); - - auto TryResolveConstant = [&ResolvedValues](Value *V) { - auto It = ResolvedValues.find(V); - if (It != ResolvedValues.end()) - V = It->second; - return dyn_cast(V); - }; - - Instruction *I = InitialInst; - while (true) { - if (isa(I)) { - assert(!cast(I)->getReturnValue()); - ReplaceInstWithInst(InitialInst, I->clone()); - return true; - } - - if (auto *BR = dyn_cast(I)) { - unsigned SuccIndex = 0; - if (BR->isConditional()) { - // Handle the case the condition of the conditional branch is constant. - // e.g., - // - // br i1 false, label %cleanup, label %CoroEnd - // - // It is possible during the transformation. We could continue the - // simplifying in this case. - ConstantInt *Cond = TryResolveConstant(BR->getCondition()); - if (!Cond) - return false; - - SuccIndex = Cond->isOne() ? 0 : 1; - } - - BasicBlock *Succ = BR->getSuccessor(SuccIndex); - scanPHIsAndUpdateValueMap(I, Succ, ResolvedValues); - I = Succ->getFirstNonPHIOrDbgOrLifetime(); - continue; - } - - if (auto *Cmp = dyn_cast(I)) { - // If the case number of suspended switch instruction is reduced to - // 1, then it is simplified to CmpInst in llvm::ConstantFoldTerminator. - // Try to constant fold it. - ConstantInt *Cond0 = TryResolveConstant(Cmp->getOperand(0)); - ConstantInt *Cond1 = TryResolveConstant(Cmp->getOperand(1)); - if (Cond0 && Cond1) { - ConstantInt *Result = - dyn_cast_or_null(ConstantFoldCompareInstOperands( - Cmp->getPredicate(), Cond0, Cond1, DL)); - if (Result) { - ResolvedValues[Cmp] = Result; - I = I->getNextNode(); - continue; - } - } - } - - if (auto *SI = dyn_cast(I)) { - ConstantInt *Cond = TryResolveConstant(SI->getCondition()); - if (!Cond) - return false; - - BasicBlock *Succ = SI->findCaseValue(Cond)->getCaseSuccessor(); - scanPHIsAndUpdateValueMap(I, Succ, ResolvedValues); - I = Succ->getFirstNonPHIOrDbgOrLifetime(); - continue; - } - - if (I->isDebugOrPseudoInst() || I->isLifetimeStartOrEnd() || - wouldInstructionBeTriviallyDead(I)) { - // We can skip instructions without side effects. If their values are - // needed, we'll notice later, e.g. when hitting a conditional branch. - I = I->getNextNode(); - continue; - } - - break; - } - - return false; -} - -// Check whether CI obeys the rules of musttail attribute. -static bool shouldBeMustTail(const CallInst &CI, const Function &F) { - if (CI.isInlineAsm()) - return false; - - // Match prototypes and calling conventions of resume function. - FunctionType *CalleeTy = CI.getFunctionType(); - if (!CalleeTy->getReturnType()->isVoidTy() || (CalleeTy->getNumParams() != 1)) - return false; - - Type *CalleeParmTy = CalleeTy->getParamType(0); - if (!CalleeParmTy->isPointerTy() || - (CalleeParmTy->getPointerAddressSpace() != 0)) - return false; - - if (CI.getCallingConv() != F.getCallingConv()) - return false; - - // CI should not has any ABI-impacting function attributes. - static const Attribute::AttrKind ABIAttrs[] = { - Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca, - Attribute::Preallocated, Attribute::InReg, Attribute::Returned, - Attribute::SwiftSelf, Attribute::SwiftError}; - AttributeList Attrs = CI.getAttributes(); - for (auto AK : ABIAttrs) - if (Attrs.hasParamAttr(0, AK)) - return false; - - return true; -} - // Coroutine has no suspend points. Remove heap allocation for the coroutine // frame if possible. 
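The CoroSplit changes in this region replace the old pass that retrofitted musttail onto resume calls with a direct lowering: coro_await_suspend_handle is followed by a resume call through the coro.subfn.addr helper, the call is recorded in Shape.SymmetricTransfers, and in the cloned resume function it is later tagged musttail (where the target supports it) and followed by a ret void. At the source level this corresponds to an awaiter whose await_suspend returns another coroutine's handle. A standalone C++20 sketch of that source pattern; Task, TransferTo, producer and consumer are hypothetical and unrelated to the patch:

#include <coroutine>
#include <cstdio>

struct Task {
  struct promise_type {
    Task get_return_object() {
      return {std::coroutine_handle<promise_type>::from_promise(*this)};
    }
    std::suspend_always initial_suspend() { return {}; }
    std::suspend_always final_suspend() noexcept { return {}; }
    void return_void() {}
    void unhandled_exception() {}
  };
  std::coroutine_handle<promise_type> Handle;
};

struct TransferTo {
  std::coroutine_handle<> Next;
  bool await_ready() { return false; }
  // Returning a handle here is the symmetric transfer that the lowering turns
  // into a resume call, tagged musttail where the target allows it.
  std::coroutine_handle<> await_suspend(std::coroutine_handle<>) { return Next; }
  void await_resume() {}
};

Task consumer() {
  std::puts("consumer runs");
  co_return;
}

Task producer(std::coroutine_handle<> Next) {
  std::puts("producer transfers control");
  co_await TransferTo{Next}; // resumes 'consumer' via symmetric transfer
}

int main() {
  Task C = consumer();
  Task P = producer(C.Handle);
  P.Handle.resume(); // runs producer, which transfers to consumer
  P.Handle.destroy();
  C.Handle.destroy();
  return 0;
}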
static void handleNoSuspendCoroutine(coro::Shape &Shape) { @@ -1523,24 +1430,16 @@ struct SwitchCoroutineSplitter { createResumeEntryBlock(F, Shape); auto *ResumeClone = - createClone(F, ".resume", Shape, CoroCloner::Kind::SwitchResume); + createClone(F, ".resume", Shape, CoroCloner::Kind::SwitchResume, TTI); auto *DestroyClone = - createClone(F, ".destroy", Shape, CoroCloner::Kind::SwitchUnwind); + createClone(F, ".destroy", Shape, CoroCloner::Kind::SwitchUnwind, TTI); auto *CleanupClone = - createClone(F, ".cleanup", Shape, CoroCloner::Kind::SwitchCleanup); + createClone(F, ".cleanup", Shape, CoroCloner::Kind::SwitchCleanup, TTI); postSplitCleanup(*ResumeClone); postSplitCleanup(*DestroyClone); postSplitCleanup(*CleanupClone); - // Adding musttail call to support symmetric transfer. - // Skip targets which don't support tail call. - // - // FIXME: Could we support symmetric transfer effectively without musttail - // call? - if (TTI.supportsTailCalls()) - addMustTailToCoroResumes(*ResumeClone, TTI); - // Store addresses resume/destroy/cleanup functions in the coroutine frame. updateCoroFrame(Shape, ResumeClone, DestroyClone, CleanupClone); @@ -1560,8 +1459,9 @@ struct SwitchCoroutineSplitter { // new entry block and replacing coro.suspend an appropriate value to force // resume or cleanup pass for every suspend point. static Function *createClone(Function &F, const Twine &Suffix, - coro::Shape &Shape, CoroCloner::Kind FKind) { - CoroCloner Cloner(F, Suffix, Shape, FKind); + coro::Shape &Shape, CoroCloner::Kind FKind, + TargetTransformInfo &TTI) { + CoroCloner Cloner(F, Suffix, Shape, FKind, TTI); Cloner.create(); return Cloner.getFunction(); } @@ -1662,34 +1562,6 @@ struct SwitchCoroutineSplitter { Shape.SwitchLowering.ResumeEntryBlock = NewEntry; } - // Add musttail to any resume instructions that is immediately followed by a - // suspend (i.e. ret). We do this even in -O0 to support guaranteed tail call - // for symmetrical coroutine control transfer (C++ Coroutines TS extension). - // This transformation is done only in the resume part of the coroutine that - // has identical signature and calling convention as the coro.resume call. - static void addMustTailToCoroResumes(Function &F, TargetTransformInfo &TTI) { - bool Changed = false; - - // Collect potential resume instructions. - SmallVector Resumes; - for (auto &I : instructions(F)) - if (auto *Call = dyn_cast(&I)) - if (shouldBeMustTail(*Call, F)) - Resumes.push_back(Call); - - // Set musttail on those that are followed by a ret instruction. - for (CallInst *Call : Resumes) - // Skip targets which don't support tail call on the specific case. - if (TTI.supportsTailCallFor(Call) && - simplifyTerminatorLeadingToRet(Call->getNextNode())) { - Call->setTailCallKind(CallInst::TCK_MustTail); - Changed = true; - } - - if (Changed) - removeUnreachableBlocks(F); - } - // Store addresses of Resume/Destroy/Cleanup functions in the coroutine frame. static void updateCoroFrame(coro::Shape &Shape, Function *ResumeFn, Function *DestroyFn, Function *CleanupFn) { @@ -1893,12 +1765,13 @@ static void splitAsyncCoroutine(Function &F, coro::Shape &Shape, auto *Suspend = Shape.CoroSuspends[Idx]; auto *Clone = Clones[Idx]; - CoroCloner(F, "resume." + Twine(Idx), Shape, Clone, Suspend).create(); + CoroCloner(F, "resume." 
+ Twine(Idx), Shape, Clone, Suspend, TTI).create(); } } static void splitRetconCoroutine(Function &F, coro::Shape &Shape, - SmallVectorImpl &Clones) { + SmallVectorImpl &Clones, + TargetTransformInfo &TTI) { assert(Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce); assert(Clones.empty()); @@ -2021,7 +1894,7 @@ static void splitRetconCoroutine(Function &F, coro::Shape &Shape, auto Suspend = Shape.CoroSuspends[i]; auto Clone = Clones[i]; - CoroCloner(F, "resume." + Twine(i), Shape, Clone, Suspend).create(); + CoroCloner(F, "resume." + Twine(i), Shape, Clone, Suspend, TTI).create(); } } @@ -2073,7 +1946,7 @@ splitCoroutine(Function &F, SmallVectorImpl &Clones, break; case coro::ABI::Retcon: case coro::ABI::RetconOnce: - splitRetconCoroutine(F, Shape, Clones); + splitRetconCoroutine(F, Shape, Clones, TTI); break; } } diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index a1c78d6a44ef46..1a92bc1636257b 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -47,8 +47,8 @@ coro::LowererBase::LowererBase(Module &M) // // call ptr @llvm.coro.subfn.addr(ptr %Arg, i8 %index) -Value *coro::LowererBase::makeSubFnCall(Value *Arg, int Index, - Instruction *InsertPt) { +CallInst *coro::LowererBase::makeSubFnCall(Value *Arg, int Index, + Instruction *InsertPt) { auto *IndexVal = ConstantInt::get(Type::getInt8Ty(Context), Index); auto *Fn = Intrinsic::getDeclaration(&TheModule, Intrinsic::coro_subfn_addr); diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 52803e9bea451e..dd8eb4688d3c9d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -331,7 +331,7 @@ Instruction * InstCombinerImpl::foldPHIArgInsertValueInstructionIntoPHI(PHINode &PN) { auto *FirstIVI = cast(PN.getIncomingValue(0)); - // Scan to see if all operands are `insertvalue`'s with the same indicies, + // Scan to see if all operands are `insertvalue`'s with the same indices, // and all have a single use. for (Value *V : drop_begin(PN.incoming_values())) { auto *I = dyn_cast(V); @@ -371,7 +371,7 @@ Instruction * InstCombinerImpl::foldPHIArgExtractValueInstructionIntoPHI(PHINode &PN) { auto *FirstEVI = cast(PN.getIncomingValue(0)); - // Scan to see if all operands are `extractvalue`'s with the same indicies, + // Scan to see if all operands are `extractvalue`'s with the same indices, // and all have a single use. for (Value *V : drop_begin(PN.incoming_values())) { auto *I = dyn_cast(V); diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp index 261c1259c9c961..b5333c532280ca 100644 --- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -238,18 +238,6 @@ class CallInfo { const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; } }; -static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) { - static const unsigned KnownIDs[] = {LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_range, - LLVMContext::MD_fpmath, - LLVMContext::MD_invariant_load, - LLVMContext::MD_invariant_group, - LLVMContext::MD_access_group}; - combineMetadata(ReplInst, I, KnownIDs, true); -} - // This pass hoists common computations across branches sharing common // dominator. 
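The GVNHoist hunk here drops the local combineKnownMetadata helper in favor of combineMetadataForCSE, which applies the general rule for merging metadata when one instruction stands in for another: only facts that are valid for both instructions may survive on the replacement. A standalone sketch of that idea using plain string sets; the metadata names are illustrative only, and the real merging is per-kind (most-generic TBAA, merged ranges, and so on) rather than a literal set intersection:

#include <algorithm>
#include <cassert>
#include <iterator>
#include <set>
#include <string>

int main() {
  // Facts attached to the instruction being kept and to the one it replaces.
  std::set<std::string> ReplMD = {"!tbaa A", "!nonnull", "!range [0,10)"};
  std::set<std::string> OtherMD = {"!tbaa A", "!range [0,10)"};

  // Conceptually, only facts present on (and valid for) both may be kept.
  std::set<std::string> Kept;
  std::set_intersection(ReplMD.begin(), ReplMD.end(), OtherMD.begin(),
                        OtherMD.end(), std::inserter(Kept, Kept.end()));
  assert(Kept == (std::set<std::string>{"!tbaa A", "!range [0,10)"}));
  return 0;
}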
The primary goal is to reduce the code size, and in some // cases reduce critical path (by exposing more ILP). @@ -996,8 +984,8 @@ unsigned GVNHoist::rauw(const SmallVecInsn &Candidates, Instruction *Repl, MSSAUpdater->removeMemoryAccess(OldMA); } + combineMetadataForCSE(Repl, I, true); Repl->andIRFlags(I); - combineKnownMetadata(Repl, I); I->replaceAllUsesWith(Repl); // Also invalidate the Alias Analysis cache. MD->removeInstruction(I); diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 9f85396cde2598..1a9eaf28f6e47e 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -1008,7 +1008,7 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP, } IRBuilder<> Builder(GEP); - // For trivial GEP chains, we can swap the indicies. + // For trivial GEP chains, we can swap the indices. Value *NewSrc = Builder.CreateGEP( GEP->getSourceElementType(), PtrGEP->getPointerOperand(), SmallVector(GEP->indices()), "", IsChainInBounds); diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp index 101b70d8def4a4..54d46117729c98 100644 --- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp +++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -1061,7 +1061,7 @@ void initializeNetwork(const ProfiParams &Params, MinCostMaxFlow &Network, assert(NumJumps > 0 && "Too few jumps in a function"); // Introducing dummy source/sink pairs to allow flow circulation. - // The nodes corresponding to blocks of the function have indicies in + // The nodes corresponding to blocks of the function have indices in // the range [0 .. 2 * NumBlocks); the dummy sources/sinks are indexed by the // next four values. uint64_t S = 2 * NumBlocks; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2e0a39c4b4fdc3..d21b5e1cc04178 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6485,7 +6485,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); - SmallVector ReuseShuffleIndicies; + SmallVector ReuseShuffleIndices; SmallVector UniqueValues; SmallVector NonUniqueValueVL; auto TryToFindDuplicates = [&](const InstructionsState &S, @@ -6494,19 +6494,19 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, DenseMap UniquePositions(VL.size()); for (Value *V : VL) { if (isConstant(V)) { - ReuseShuffleIndicies.emplace_back( + ReuseShuffleIndices.emplace_back( isa(V) ? PoisonMaskElem : UniqueValues.size()); UniqueValues.emplace_back(V); continue; } auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(Res.first->second); + ReuseShuffleIndices.emplace_back(Res.first->second); if (Res.second) UniqueValues.emplace_back(V); } size_t NumUniqueScalarValues = UniqueValues.size(); if (NumUniqueScalarValues == VL.size()) { - ReuseShuffleIndicies.clear(); + ReuseShuffleIndices.clear(); } else { // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. 
if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { @@ -6532,7 +6532,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, })) { unsigned PWSz = PowerOf2Ceil(UniqueValues.size()); if (PWSz == VL.size()) { - ReuseShuffleIndicies.clear(); + ReuseShuffleIndices.clear(); } else { NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end()); NonUniqueValueVL.append(PWSz - UniqueValues.size(), @@ -6579,7 +6579,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } @@ -6590,7 +6590,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } @@ -6694,7 +6694,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } @@ -6722,7 +6722,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } } else { @@ -6745,7 +6745,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, << ") is already in tree.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } } @@ -6757,7 +6757,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } } @@ -6810,7 +6810,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps); if (State == TreeEntry::NeedToGather) { newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } @@ -6832,7 +6832,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); NonScheduledFirst.insert(VL.front()); return; } @@ -6845,7 +6845,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, auto *PH = cast(VL0); TreeEntry *TE = - newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); // Keeps the reordered operands to avoid code duplication. 
@@ -6862,7 +6862,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (CurrentOrder.empty()) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -6881,7 +6881,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Insert new order with initial value 0, if it does not exist, // otherwise return the iterator to the existing one. newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); + ReuseShuffleIndices, CurrentOrder); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -6890,7 +6890,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, return; } case Instruction::InsertElement: { - assert(ReuseShuffleIndicies.empty() && "All inserts should be unique"); + assert(ReuseShuffleIndices.empty() && "All inserts should be unique"); auto OrdCompare = [](const std::pair &P1, const std::pair &P2) { @@ -6941,12 +6941,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); + ReuseShuffleIndices, CurrentOrder); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } TE->setOperandsInOrder(); @@ -6955,10 +6955,10 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Vectorizing non-consecutive loads with `llvm.masked.gather`. if (CurrentOrder.empty()) { TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, - UserTreeIdx, ReuseShuffleIndicies); + UserTreeIdx, ReuseShuffleIndices); } else { TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, - UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); + UserTreeIdx, ReuseShuffleIndices, CurrentOrder); } TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n"); @@ -6966,7 +6966,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, case TreeEntry::ScatterVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, - UserTreeIdx, ReuseShuffleIndicies); + UserTreeIdx, ReuseShuffleIndices); TE->setOperandsInOrder(); buildTree_rec(PointerOps, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); @@ -7020,7 +7020,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, ExtraBitWidthNodes.insert(VectorizableTree.size() + 1); } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); TE->setOperandsInOrder(); @@ -7039,7 +7039,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Check that all of the compares have the same predicate. 
CmpInst::Predicate P0 = cast(VL0)->getPredicate(); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); ValueList Left, Right; @@ -7100,7 +7100,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, case Instruction::Or: case Instruction::Xor: { TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -7128,7 +7128,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } case Instruction::GetElementPtr: { TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); SmallVector Operands(2); // Prepare the operand vector for pointer operands. @@ -7194,14 +7194,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (CurrentOrder.empty()) { // Original stores are consecutive and does not require reordering. TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); } else { fixupOrderingIndices(CurrentOrder); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); + ReuseShuffleIndices, CurrentOrder); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); @@ -7215,7 +7215,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); // Sort operands of the instructions so that each side is more likely to // have the same opcode. if (isCommutative(VL0)) { @@ -7261,7 +7261,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } case Instruction::ShuffleVector: { TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. @@ -12107,8 +12107,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, unsigned VF = E->getVectorFactor(); bool NeedFreeze = false; - SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), - E->ReuseShuffleIndices.end()); + SmallVector ReuseShuffleIndices(E->ReuseShuffleIndices.begin(), + E->ReuseShuffleIndices.end()); SmallVector GatheredScalars(E->Scalars.begin(), E->Scalars.end()); // Build a mask out of the reorder indices and reorder scalars per this // mask. 
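The TryToFindDuplicates lambda in the buildTree_rec hunks above builds ReuseShuffleIndices so that a bundle with repeated scalars is vectorized only over its unique values and the original order is recovered by a shuffle. A standalone sketch of that bookkeeping with made-up scalar values; the map-based dedup mirrors the UniquePositions logic but is not the SLP code itself:

#include <cassert>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<int> VL = {5, 7, 5, 9, 7}; // scalars in the bundle, with repeats
  std::vector<int> Unique;               // values that actually get vectorized
  std::vector<int> ReuseShuffleIndices;  // per-scalar index of its unique slot
  std::unordered_map<int, int> FirstPos;

  for (int V : VL) {
    auto It = FirstPos.try_emplace(V, (int)Unique.size()).first;
    ReuseShuffleIndices.push_back(It->second);
    if (It->second == (int)Unique.size())
      Unique.push_back(V); // first time this value is seen
  }

  // Only {5, 7, 9} are vectorized; shuffling with {0, 1, 0, 2, 1} rebuilds VL.
  assert((Unique == std::vector<int>{5, 7, 9}));
  assert((ReuseShuffleIndices == std::vector<int>{0, 1, 0, 2, 1}));
  return 0;
}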
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 5f93339083f0c2..efe8c21874a3ac 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -171,6 +171,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) { case Instruction::ICmp: case Instruction::FCmp: return IntegerType::get(Ctx, 1); + case Instruction::AddrSpaceCast: case Instruction::Alloca: case Instruction::BitCast: case Instruction::Trunc: diff --git a/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll b/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll new file mode 100644 index 00000000000000..2a210a5a445b94 --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll @@ -0,0 +1,756 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes='print' -disable-output %s 2>&1 | FileCheck %s + +define void @test_invar_dependence_before_positive_strided_access_1(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_before_positive_strided_access_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_positive_strided_access_2(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_before_positive_strided_access_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_before_positive_strided_access_1(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_not_before_positive_strided_access_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 3 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_before_positive_strided_access_2(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_not_before_positive_strided_access_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 3 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_positive_strided_access_1_different_access_sizes(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_before_positive_strided_access_1_different_access_sizes' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. 
+; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i8 %t, ptr %gep, align 1 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + %t = trunc i32 %l to i8 + store i8 %t, ptr %gep + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_before_positive_strided_access_1_different_access_sizes(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_not_before_positive_strided_access_1_different_access_sizes' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i64, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %t, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i64, ptr %a + %t = trunc i64 %l to i32 + store i32 %t, ptr %gep + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_negative_strided_access_1(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_before_negative_strided_access_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i32, ptr %a, i32 100 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = sub i32 %iv, 1 + %ec = icmp eq i32 %iv.next, -100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_negative_strided_access_2(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_before_negative_strided_access_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. 
Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i32, ptr %a, i32 100 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = sub i32 %iv, 1 + %ec = icmp eq i32 %iv.next, -100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + + +define void @test_invar_dependence_not_before_negative_strided_access_1(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_not_before_negative_strided_access_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i32, ptr %a, i32 99 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = sub i32 %iv, 1 + %ec = icmp eq i32 %iv.next, -100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_before_negative_strided_access_2(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_not_before_negative_strided_access_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i32, ptr %a, i32 99 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = sub i32 %iv, 1 + %ec = icmp eq i32 %iv.next, -100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_both_invar_before_1(ptr %a) { +; CHECK-LABEL: 'test_both_invar_before_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep.off, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %l = load i32, ptr %a + store i32 %l, ptr %gep.off + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_both_invar_before_2(ptr %a) { +; CHECK-LABEL: 'test_both_invar_before_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep.off, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %l = load i32, ptr %gep.off + store i32 %l, ptr %a + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_both_invar_not_before_1(ptr %a) { +; CHECK-LABEL: 'test_both_invar_not_before_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep.off, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 3 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %l = load i32, ptr %a + store i32 %l, ptr %gep.off + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_both_invar_not_before_2(ptr %a) { +; CHECK-LABEL: 'test_both_invar_not_before_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep.off, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 3 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %l = load i32, ptr %gep.off + store i32 %l, ptr %a + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_via_loop_guard_positive_strided_access_1(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_before_via_loop_guard_positive_strided_access_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 %off + %c = icmp sge i32 %off, 4 + br i1 %c, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_via_loop_guard_positive_strided_access_2(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_before_via_loop_guard_positive_strided_access_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. 
+; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 %off + %c = icmp sge i32 %off, 4 + br i1 %c, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} +define void @test_invar_dependence_not_before_via_loop_guard_positive_strided_access_1(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_not_before_via_loop_guard_positive_strided_access_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 %off + %c = icmp sge i32 %off, 3 + br i1 %c, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_before_via_loop_guard_positive_strided_access_2(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_not_before_via_loop_guard_positive_strided_access_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 %off + %c = icmp sge i32 %off, 3 + br i1 %c, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_positive_strided_access_via_loop_guard_1(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_before_positive_strided_access_via_loop_guard_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: could not determine number of loop iterations +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + %c = icmp sge i32 %off, 0 + br i1 %c, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = add i32 %iv, %off + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_positive_strided_access_via_loop_guard_2(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_before_positive_strided_access_via_loop_guard_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: could not determine number of loop iterations +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + %c = icmp sge i32 %off, 0 + br i1 %c, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = add i32 %iv, %off + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_known_beforepositive_strided_access_not_known_via_loop_guard_1(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_not_known_beforepositive_strided_access_not_known_via_loop_guard_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: could not determine number of loop iterations +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = add i32 %iv, %off + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_known_beforepositive_strided_access_not_known_via_loop_guard_2(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_not_known_beforepositive_strided_access_not_known_via_loop_guard_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: could not determine number of loop iterations +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = add i32 %iv, %off + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll index f84d476fc12271..bd5dc6e2070986 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll @@ -157,10 +157,8 @@ declare void @undef_func() ; GFX908: .amdhsa_next_free_vgpr 32 ; GFX90A: .amdhsa_next_free_vgpr 64 ; GFX90A: .amdhsa_accum_offset 32 -; GCN908: NumVgprs: 128 -; GCN908: NumAgprs: 128 -; GCN90A: NumVgprs: 256 -; GCN90A: NumAgprs: 256 +; GCN: NumVgprs: 32 +; GCN: NumAgprs: 32 ; GFX908: TotalNumVgprs: 32 ; GFX90A: TotalNumVgprs: 64 ; GFX908: VGPRBlocks: 7 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll index 9eacb88066c0f3..1cc5b7f7d14eec 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll @@ -88,8 +88,8 @@ define amdgpu_kernel void @max_10_vgprs_spill_v32(ptr addrspace(1) %p) #0 { ; GFX900: ScratchSize: 132 ; GFX908: NumVgprs: 252 ; GFX908: ScratchSize: 0 -; GCN900: VGPRBlocks: 63 -; GCN908: VGPRBlocks: 62 +; GFX900: VGPRBlocks: 63 +; GFX908: VGPRBlocks: 62 ; GFX900: NumVGPRsForWavesPerEU: 256 ; GFX908: NumVGPRsForWavesPerEU: 252 define amdgpu_kernel void @max_256_vgprs_spill_9x32(ptr addrspace(1) %p) #1 { diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll index 187f758b780204..0a7fa38b0c8abb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll @@ -236,11 +236,12 @@ define fastcc @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; 
CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill @@ -248,29 +249,40 @@ define fastcc @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vl8re32.v v8, (a3) -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; CHECK-NEXT: add a1, a0, a1 -; CHECK-NEXT: vl8re32.v v0, (a0) ; CHECK-NEXT: vl8re32.v v8, (a1) -; CHECK-NEXT: vl8re32.v v16, (a2) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re32.v v0, (a0) ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vl8re32.v v8, (a3) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re32.v v16, (a2) ; CHECK-NEXT: vadd.vv v0, v24, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v8, v24, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v24, v24, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v8, v24, v8 ; CHECK-NEXT: vadd.vv v24, v0, v16 ; CHECK-NEXT: vadd.vx v16, v8, a4 ; CHECK-NEXT: vadd.vx v8, v24, a4 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll index 647d3158b6167f..fa62143546df60 100644 --- a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll @@ -39,11 +39,11 @@ define @caller_scalable_vector_split_indirect( @caller_scalable_vector_split_indirect( @nearbyint_v2f16(<2 x half> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <2 x half> @llvm.experimental.constrained.nearbyint.v2f16(<2 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <2 x half> %r @@ -42,9 +42,9 @@ define <4 x half> @nearbyint_v4f16(<4 x half> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <4 x half> @llvm.experimental.constrained.nearbyint.v4f16(<4 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <4 x half> %r @@ -65,9 +65,9 @@ define <8 x half> @nearbyint_v8f16(<8 x half> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, 
v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <8 x half> @llvm.experimental.constrained.nearbyint.v8f16(<8 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <8 x half> %r @@ -88,9 +88,9 @@ define <16 x half> @nearbyint_v16f16(<16 x half> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <16 x half> @llvm.experimental.constrained.nearbyint.v16f16(<16 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <16 x half> %r @@ -112,9 +112,9 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <32 x half> @llvm.experimental.constrained.nearbyint.v32f16(<32 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <32 x half> %r @@ -135,9 +135,9 @@ define <2 x float> @nearbyint_v2f32(<2 x float> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <2 x float> %r @@ -158,9 +158,9 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <4 x float> %r @@ -181,9 +181,9 @@ define <8 x float> @nearbyint_v8f32(<8 x float> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <8 x float> @llvm.experimental.constrained.nearbyint.v8f32(<8 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <8 x float> %r @@ -204,9 +204,9 @@ define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <16 x float> @llvm.experimental.constrained.nearbyint.v16f32(<16 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <16 x float> %r @@ -227,9 +227,9 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; 
CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <2 x double> %r @@ -250,9 +250,9 @@ define <4 x double> @nearbyint_v4f64(<4 x double> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <4 x double> %r @@ -273,9 +273,9 @@ define <8 x double> @nearbyint_v8f64(<8 x double> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <8 x double> @llvm.experimental.constrained.nearbyint.v8f64(<8 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <8 x double> %r diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll index a8e4af2d7368e8..6320b07125bb0b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll @@ -359,13 +359,13 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) { ; RV32-NEXT: feq.d a0, fa3, fa3 ; RV32-NEXT: fmax.d fa3, fa3, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-NEXT: fld fa2, 40(sp) ; RV32-NEXT: fcvt.w.d a2, fa3, rtz +; RV32-NEXT: fld fa3, 40(sp) ; RV32-NEXT: neg a0, a0 ; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: feq.d a2, fa2, fa2 -; RV32-NEXT: fmax.d fa3, fa2, fa5 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: feq.d a2, fa3, fa3 +; RV32-NEXT: fmax.d fa3, fa3, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 ; RV32-NEXT: fcvt.w.d a3, fa3, rtz ; RV32-NEXT: fld fa3, 32(sp) @@ -460,13 +460,13 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) { ; RV64-NEXT: feq.d a0, fa3, fa3 ; RV64-NEXT: fmax.d fa3, fa3, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64-NEXT: fld fa2, 40(sp) ; RV64-NEXT: fcvt.l.d a2, fa3, rtz +; RV64-NEXT: fld fa3, 40(sp) ; RV64-NEXT: neg a0, a0 ; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: feq.d a2, fa2, fa2 -; RV64-NEXT: fmax.d fa3, fa2, fa5 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: feq.d a2, fa3, fa3 +; RV64-NEXT: fmax.d fa3, fa3, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 ; RV64-NEXT: fcvt.l.d a3, fa3, rtz ; RV64-NEXT: fld fa3, 32(sp) @@ -557,7 +557,6 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: vfmv.f.s fa4, v8 ; RV32-NEXT: fmax.d fa4, fa4, fa3 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV32-NEXT: fld fa2, 40(sp) ; RV32-NEXT: fmin.d fa4, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa4, rtz @@ -566,9 +565,10 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV32-NEXT: fmin.d fa2, fa2, fa5 ; RV32-NEXT: fcvt.wu.d a2, fa2, rtz ; RV32-NEXT: fmax.d fa4, fa4, fa3 -; RV32-NEXT: fld fa2, 48(sp) ; RV32-NEXT: fmin.d fa4, fa4, fa5 +; RV32-NEXT: fld fa2, 
48(sp) ; RV32-NEXT: fcvt.wu.d a3, fa4, rtz +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV32-NEXT: vslide1down.vx v8, v10, a0 ; RV32-NEXT: fmax.d fa4, fa2, fa3 ; RV32-NEXT: fmin.d fa4, fa4, fa5 @@ -633,7 +633,6 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV64-NEXT: vslidedown.vi v8, v8, 3 ; RV64-NEXT: vfmv.f.s fa4, v8 ; RV64-NEXT: fmax.d fa4, fa4, fa3 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64-NEXT: fld fa2, 40(sp) ; RV64-NEXT: fmin.d fa4, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa4, rtz @@ -642,9 +641,10 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV64-NEXT: fmin.d fa2, fa2, fa5 ; RV64-NEXT: fcvt.lu.d a2, fa2, rtz ; RV64-NEXT: fmax.d fa4, fa4, fa3 -; RV64-NEXT: fld fa2, 48(sp) ; RV64-NEXT: fmin.d fa4, fa4, fa5 +; RV64-NEXT: fld fa2, 48(sp) ; RV64-NEXT: fcvt.lu.d a3, fa4, rtz +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64-NEXT: vslide1down.vx v8, v10, a0 ; RV64-NEXT: fmax.d fa4, fa2, fa3 ; RV64-NEXT: fmin.d fa4, fa4, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll index 6ffa6ac250ed7f..9c76b83d0974af 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll @@ -132,12 +132,12 @@ define <3 x float> @si2fp_v3i1_v3f32(<3 x i1> %x) { define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { ; ZVFH32-LABEL: si2fp_v3i7_v3f32: ; ZVFH32: # %bb.0: -; ZVFH32-NEXT: lw a1, 4(a0) -; ZVFH32-NEXT: lw a2, 0(a0) -; ZVFH32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH32-NEXT: lw a1, 0(a0) +; ZVFH32-NEXT: lw a2, 4(a0) ; ZVFH32-NEXT: lw a0, 8(a0) -; ZVFH32-NEXT: vmv.v.x v8, a2 -; ZVFH32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFH32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH32-NEXT: vmv.v.x v8, a1 +; ZVFH32-NEXT: vslide1down.vx v8, v8, a2 ; ZVFH32-NEXT: vslide1down.vx v8, v8, a0 ; ZVFH32-NEXT: vslidedown.vi v8, v8, 1 ; ZVFH32-NEXT: vadd.vv v8, v8, v8 @@ -149,12 +149,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { ; ; ZVFH64-LABEL: si2fp_v3i7_v3f32: ; ZVFH64: # %bb.0: -; ZVFH64-NEXT: ld a1, 8(a0) -; ZVFH64-NEXT: ld a2, 0(a0) -; ZVFH64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH64-NEXT: ld a1, 0(a0) +; ZVFH64-NEXT: ld a2, 8(a0) ; ZVFH64-NEXT: ld a0, 16(a0) -; ZVFH64-NEXT: vmv.v.x v8, a2 -; ZVFH64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFH64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH64-NEXT: vmv.v.x v8, a1 +; ZVFH64-NEXT: vslide1down.vx v8, v8, a2 ; ZVFH64-NEXT: vslide1down.vx v8, v8, a0 ; ZVFH64-NEXT: vslidedown.vi v8, v8, 1 ; ZVFH64-NEXT: vadd.vv v8, v8, v8 @@ -166,12 +166,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { ; ; ZVFHMIN32-LABEL: si2fp_v3i7_v3f32: ; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: lw a1, 4(a0) -; ZVFHMIN32-NEXT: lw a2, 0(a0) -; ZVFHMIN32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN32-NEXT: lw a1, 0(a0) +; ZVFHMIN32-NEXT: lw a2, 4(a0) ; ZVFHMIN32-NEXT: lw a0, 8(a0) -; ZVFHMIN32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN32-NEXT: vslide1down.vx v8, v8, a2 ; ZVFHMIN32-NEXT: vslide1down.vx v8, v8, a0 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v8, 1 ; ZVFHMIN32-NEXT: vadd.vv v8, v8, v8 @@ -183,12 +183,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { ; ; ZVFHMIN64-LABEL: si2fp_v3i7_v3f32: ; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: ld a1, 8(a0) -; ZVFHMIN64-NEXT: ld a2, 0(a0) -; ZVFHMIN64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN64-NEXT: ld a1, 0(a0) +; ZVFHMIN64-NEXT: ld a2, 
8(a0) ; ZVFHMIN64-NEXT: ld a0, 16(a0) -; ZVFHMIN64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN64-NEXT: vmv.v.x v8, a1 +; ZVFHMIN64-NEXT: vslide1down.vx v8, v8, a2 ; ZVFHMIN64-NEXT: vslide1down.vx v8, v8, a0 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v8, 1 ; ZVFHMIN64-NEXT: vadd.vv v8, v8, v8 @@ -205,12 +205,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; ZVFH32-LABEL: ui2fp_v3i7_v3f32: ; ZVFH32: # %bb.0: -; ZVFH32-NEXT: lw a1, 4(a0) -; ZVFH32-NEXT: lw a2, 0(a0) -; ZVFH32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH32-NEXT: lw a1, 0(a0) +; ZVFH32-NEXT: lw a2, 4(a0) ; ZVFH32-NEXT: lw a0, 8(a0) -; ZVFH32-NEXT: vmv.v.x v8, a2 -; ZVFH32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFH32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH32-NEXT: vmv.v.x v8, a1 +; ZVFH32-NEXT: vslide1down.vx v8, v8, a2 ; ZVFH32-NEXT: vslide1down.vx v8, v8, a0 ; ZVFH32-NEXT: vslidedown.vi v8, v8, 1 ; ZVFH32-NEXT: li a0, 127 @@ -222,12 +222,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; ; ZVFH64-LABEL: ui2fp_v3i7_v3f32: ; ZVFH64: # %bb.0: -; ZVFH64-NEXT: ld a1, 8(a0) -; ZVFH64-NEXT: ld a2, 0(a0) -; ZVFH64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH64-NEXT: ld a1, 0(a0) +; ZVFH64-NEXT: ld a2, 8(a0) ; ZVFH64-NEXT: ld a0, 16(a0) -; ZVFH64-NEXT: vmv.v.x v8, a2 -; ZVFH64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFH64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH64-NEXT: vmv.v.x v8, a1 +; ZVFH64-NEXT: vslide1down.vx v8, v8, a2 ; ZVFH64-NEXT: vslide1down.vx v8, v8, a0 ; ZVFH64-NEXT: vslidedown.vi v8, v8, 1 ; ZVFH64-NEXT: li a0, 127 @@ -239,12 +239,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; ; ZVFHMIN32-LABEL: ui2fp_v3i7_v3f32: ; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: lw a1, 4(a0) -; ZVFHMIN32-NEXT: lw a2, 0(a0) -; ZVFHMIN32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN32-NEXT: lw a1, 0(a0) +; ZVFHMIN32-NEXT: lw a2, 4(a0) ; ZVFHMIN32-NEXT: lw a0, 8(a0) -; ZVFHMIN32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN32-NEXT: vslide1down.vx v8, v8, a2 ; ZVFHMIN32-NEXT: vslide1down.vx v8, v8, a0 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v8, 1 ; ZVFHMIN32-NEXT: li a0, 127 @@ -256,12 +256,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; ; ZVFHMIN64-LABEL: ui2fp_v3i7_v3f32: ; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: ld a1, 8(a0) -; ZVFHMIN64-NEXT: ld a2, 0(a0) -; ZVFHMIN64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN64-NEXT: ld a1, 0(a0) +; ZVFHMIN64-NEXT: ld a2, 8(a0) ; ZVFHMIN64-NEXT: ld a0, 16(a0) -; ZVFHMIN64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN64-NEXT: vmv.v.x v8, a1 +; ZVFHMIN64-NEXT: vslide1down.vx v8, v8, a2 ; ZVFHMIN64-NEXT: vslide1down.vx v8, v8, a0 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v8, 1 ; ZVFHMIN64-NEXT: li a0, 127 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index 592ce6fc5be0bd..4f4f0a09de7484 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -1183,38 +1183,38 @@ define <8 x i64> @v8xi64_exact_undef_prefix(i64 %a, i64 %b, i64 %c, i64 %d) vsca define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; CHECK-LABEL: buildvec_v16i8_loads_contigous: ; CHECK: # %bb.0: -; 
CHECK-NEXT: lbu a1, 1(a0) -; CHECK-NEXT: lbu a2, 2(a0) -; CHECK-NEXT: lbu a3, 3(a0) -; CHECK-NEXT: lbu a4, 4(a0) -; CHECK-NEXT: lbu a5, 5(a0) -; CHECK-NEXT: lbu a6, 6(a0) -; CHECK-NEXT: lbu a7, 7(a0) -; CHECK-NEXT: lbu t0, 9(a0) -; CHECK-NEXT: lbu t1, 10(a0) -; CHECK-NEXT: lbu t2, 11(a0) -; CHECK-NEXT: lbu t3, 12(a0) -; CHECK-NEXT: lbu t4, 13(a0) -; CHECK-NEXT: lbu t5, 14(a0) -; CHECK-NEXT: lbu t6, 15(a0) +; CHECK-NEXT: addi a1, a0, 8 +; CHECK-NEXT: lbu a2, 1(a0) +; CHECK-NEXT: lbu a3, 2(a0) +; CHECK-NEXT: lbu a4, 3(a0) +; CHECK-NEXT: lbu a5, 4(a0) +; CHECK-NEXT: lbu a6, 5(a0) +; CHECK-NEXT: lbu a7, 6(a0) +; CHECK-NEXT: lbu t0, 7(a0) +; CHECK-NEXT: lbu t1, 9(a0) +; CHECK-NEXT: lbu t2, 10(a0) +; CHECK-NEXT: lbu t3, 11(a0) +; CHECK-NEXT: lbu t4, 12(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a0), zero -; CHECK-NEXT: addi a0, a0, 8 -; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: lbu t5, 13(a0) +; CHECK-NEXT: lbu t6, 14(a0) +; CHECK-NEXT: lbu a0, 15(a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, a4 -; CHECK-NEXT: vlse8.v v9, (a0), zero ; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vlse8.v v9, (a1), zero ; CHECK-NEXT: vslide1down.vx v8, v8, a6 -; CHECK-NEXT: vslide1down.vx v10, v8, a7 -; CHECK-NEXT: vslide1down.vx v8, v9, t0 -; CHECK-NEXT: vslide1down.vx v8, v8, t1 +; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslide1down.vx v10, v8, t0 +; CHECK-NEXT: vslide1down.vx v8, v9, t1 ; CHECK-NEXT: vslide1down.vx v8, v8, t2 ; CHECK-NEXT: vslide1down.vx v8, v8, t3 ; CHECK-NEXT: vslide1down.vx v8, v8, t4 ; CHECK-NEXT: vslide1down.vx v8, v8, t5 ; CHECK-NEXT: vslide1down.vx v8, v8, t6 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 @@ -1277,38 +1277,38 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; CHECK-LABEL: buildvec_v16i8_loads_gather: ; CHECK: # %bb.0: -; CHECK-NEXT: lbu a1, 1(a0) -; CHECK-NEXT: lbu a2, 22(a0) -; CHECK-NEXT: lbu a3, 31(a0) -; CHECK-NEXT: lbu a4, 44(a0) -; CHECK-NEXT: lbu a5, 55(a0) -; CHECK-NEXT: lbu a6, 623(a0) -; CHECK-NEXT: lbu a7, 75(a0) -; CHECK-NEXT: lbu t0, 93(a0) -; CHECK-NEXT: lbu t1, 105(a0) -; CHECK-NEXT: lbu t2, 161(a0) -; CHECK-NEXT: lbu t3, 124(a0) -; CHECK-NEXT: lbu t4, 163(a0) -; CHECK-NEXT: lbu t5, 144(a0) -; CHECK-NEXT: lbu t6, 154(a0) +; CHECK-NEXT: addi a1, a0, 82 +; CHECK-NEXT: lbu a2, 1(a0) +; CHECK-NEXT: lbu a3, 22(a0) +; CHECK-NEXT: lbu a4, 31(a0) +; CHECK-NEXT: lbu a5, 44(a0) +; CHECK-NEXT: lbu a6, 55(a0) +; CHECK-NEXT: lbu a7, 623(a0) +; CHECK-NEXT: lbu t0, 75(a0) +; CHECK-NEXT: lbu t1, 93(a0) +; CHECK-NEXT: lbu t2, 105(a0) +; CHECK-NEXT: lbu t3, 161(a0) +; CHECK-NEXT: lbu t4, 124(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a0), zero -; CHECK-NEXT: addi a0, a0, 82 -; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: lbu t5, 163(a0) +; CHECK-NEXT: lbu t6, 144(a0) +; CHECK-NEXT: lbu a0, 154(a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, a4 -; CHECK-NEXT: vlse8.v v9, (a0), zero ; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vlse8.v v9, (a1), zero ; CHECK-NEXT: vslide1down.vx v8, v8, a6 -; CHECK-NEXT: vslide1down.vx v10, v8, a7 -; CHECK-NEXT: vslide1down.vx v8, v9, t0 -; CHECK-NEXT: vslide1down.vx v8, v8, t1 +; CHECK-NEXT: vslide1down.vx v8, v8, 
a7 +; CHECK-NEXT: vslide1down.vx v10, v8, t0 +; CHECK-NEXT: vslide1down.vx v8, v9, t1 ; CHECK-NEXT: vslide1down.vx v8, v8, t2 ; CHECK-NEXT: vslide1down.vx v8, v8, t3 ; CHECK-NEXT: vslide1down.vx v8, v8, t4 ; CHECK-NEXT: vslide1down.vx v8, v8, t5 ; CHECK-NEXT: vslide1down.vx v8, v8, t6 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 @@ -1375,17 +1375,17 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; CHECK-NEXT: lbu a3, 105(a0) ; CHECK-NEXT: lbu a4, 161(a0) ; CHECK-NEXT: lbu a5, 124(a0) -; CHECK-NEXT: lbu a6, 163(a0) -; CHECK-NEXT: lbu a7, 144(a0) -; CHECK-NEXT: lbu a0, 154(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a1), zero +; CHECK-NEXT: lbu a1, 163(a0) +; CHECK-NEXT: lbu a6, 144(a0) +; CHECK-NEXT: lbu a0, 154(a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, a4 ; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a6 -; CHECK-NEXT: vslide1down.vx v8, v8, a7 ; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: ret %p9 = getelementptr i8, ptr %p, i32 82 @@ -1424,18 +1424,18 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { ; CHECK-NEXT: lbu a2, 22(a0) ; CHECK-NEXT: lbu a3, 31(a0) ; CHECK-NEXT: lbu a4, 44(a0) -; CHECK-NEXT: lbu a5, 55(a0) -; CHECK-NEXT: lbu a6, 623(a0) -; CHECK-NEXT: lbu a7, 75(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a0), zero +; CHECK-NEXT: lbu a5, 55(a0) +; CHECK-NEXT: lbu a6, 623(a0) +; CHECK-NEXT: lbu a0, 75(a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, a4 ; CHECK-NEXT: vslide1down.vx v8, v8, a5 ; CHECK-NEXT: vslide1down.vx v8, v8, a6 -; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: ret %p2 = getelementptr i8, ptr %p, i32 1 @@ -1470,24 +1470,24 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; CHECK-LABEL: buildvec_v16i8_undef_edges: ; CHECK: # %bb.0: ; CHECK-NEXT: addi a1, a0, 31 -; CHECK-NEXT: lbu a2, 44(a0) -; CHECK-NEXT: lbu a3, 55(a0) -; CHECK-NEXT: lbu a4, 623(a0) -; CHECK-NEXT: lbu a5, 75(a0) -; CHECK-NEXT: lbu a6, 93(a0) -; CHECK-NEXT: lbu a7, 105(a0) -; CHECK-NEXT: lbu t0, 161(a0) +; CHECK-NEXT: addi a2, a0, 82 +; CHECK-NEXT: lbu a3, 44(a0) +; CHECK-NEXT: lbu a4, 55(a0) +; CHECK-NEXT: lbu a5, 623(a0) +; CHECK-NEXT: lbu a6, 75(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a1), zero -; CHECK-NEXT: addi a0, a0, 82 -; CHECK-NEXT: vslide1down.vx v8, v8, a2 -; CHECK-NEXT: vlse8.v v9, (a0), zero +; CHECK-NEXT: lbu a1, 93(a0) +; CHECK-NEXT: lbu a7, 105(a0) +; CHECK-NEXT: lbu a0, 161(a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vlse8.v v9, (a2), zero ; CHECK-NEXT: vslide1down.vx v8, v8, a4 -; CHECK-NEXT: vslide1down.vx v10, v8, a5 -; CHECK-NEXT: vslide1down.vx v8, v9, a6 +; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v10, v8, a6 +; CHECK-NEXT: vslide1down.vx v8, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a7 -; CHECK-NEXT: vslide1down.vx v8, v8, t0 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: vslidedown.vi v8, v8, 4 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -1530,30 +1530,30 @@ define <16 x i8> 
@buildvec_v16i8_undef_edges(ptr %p) { define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; CHECK-LABEL: buildvec_v16i8_loads_undef_scattered: ; CHECK: # %bb.0: -; CHECK-NEXT: lbu a1, 1(a0) -; CHECK-NEXT: lbu a2, 44(a0) -; CHECK-NEXT: lbu a3, 55(a0) -; CHECK-NEXT: lbu a4, 75(a0) -; CHECK-NEXT: lbu a5, 93(a0) -; CHECK-NEXT: lbu a6, 124(a0) -; CHECK-NEXT: lbu a7, 144(a0) -; CHECK-NEXT: lbu t0, 154(a0) +; CHECK-NEXT: addi a1, a0, 82 +; CHECK-NEXT: lbu a2, 1(a0) +; CHECK-NEXT: lbu a3, 44(a0) +; CHECK-NEXT: lbu a4, 55(a0) +; CHECK-NEXT: lbu a5, 75(a0) +; CHECK-NEXT: lbu a6, 93(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a0), zero -; CHECK-NEXT: addi a0, a0, 82 -; CHECK-NEXT: vslide1down.vx v8, v8, a1 -; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: lbu a7, 124(a0) +; CHECK-NEXT: lbu t0, 144(a0) +; CHECK-NEXT: lbu a0, 154(a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a2 -; CHECK-NEXT: vlse8.v v9, (a0), zero +; CHECK-NEXT: vslidedown.vi v8, v8, 2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vlse8.v v9, (a1), zero +; CHECK-NEXT: vslide1down.vx v8, v8, a4 ; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vslide1down.vx v10, v8, a4 -; CHECK-NEXT: vslide1down.vx v8, v9, a5 +; CHECK-NEXT: vslide1down.vx v10, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v9, a6 ; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vslide1down.vx v8, v8, a6 -; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vslide1down.vx v8, v8, t0 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll index 8acc70faaa1fc9..eb95d86e34045b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll @@ -7,25 +7,23 @@ define <4 x i1> @load_large_vector(ptr %p) { ; ZVE32X-LABEL: load_large_vector: ; ZVE32X: # %bb.0: -; ZVE32X-NEXT: ld a1, 80(a0) -; ZVE32X-NEXT: ld a2, 72(a0) -; ZVE32X-NEXT: ld a3, 56(a0) -; ZVE32X-NEXT: ld a4, 32(a0) -; ZVE32X-NEXT: ld a5, 24(a0) -; ZVE32X-NEXT: ld a6, 48(a0) -; ZVE32X-NEXT: ld a7, 8(a0) -; ZVE32X-NEXT: ld a0, 0(a0) -; ZVE32X-NEXT: xor a4, a5, a4 -; ZVE32X-NEXT: snez a4, a4 +; ZVE32X-NEXT: ld a1, 56(a0) +; ZVE32X-NEXT: ld a2, 32(a0) +; ZVE32X-NEXT: ld a3, 24(a0) +; ZVE32X-NEXT: ld a4, 48(a0) +; ZVE32X-NEXT: ld a5, 8(a0) +; ZVE32X-NEXT: ld a6, 0(a0) +; ZVE32X-NEXT: xor a2, a3, a2 +; ZVE32X-NEXT: snez a2, a2 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; ZVE32X-NEXT: vmv.s.x v8, a4 +; ZVE32X-NEXT: vmv.s.x v8, a2 ; ZVE32X-NEXT: vand.vi v8, v8, 1 ; ZVE32X-NEXT: vmsne.vi v0, v8, 0 ; ZVE32X-NEXT: vmv.s.x v8, zero ; ZVE32X-NEXT: vmerge.vim v9, v8, 1, v0 -; ZVE32X-NEXT: xor a0, a0, a7 -; ZVE32X-NEXT: snez a0, a0 -; ZVE32X-NEXT: vmv.s.x v10, a0 +; ZVE32X-NEXT: xor a2, a6, a5 +; ZVE32X-NEXT: snez a2, a2 +; ZVE32X-NEXT: vmv.s.x v10, a2 ; ZVE32X-NEXT: vand.vi v10, v10, 1 ; ZVE32X-NEXT: vmsne.vi v0, v10, 0 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma @@ -35,21 +33,23 @@ define <4 x i1> @load_large_vector(ptr %p) { ; ZVE32X-NEXT: vslideup.vi v11, v9, 1 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; ZVE32X-NEXT: vmsne.vi v0, v11, 0 +; ZVE32X-NEXT: ld a2, 80(a0) ; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0 -; ZVE32X-NEXT: xor a0, a6, a3 
-; ZVE32X-NEXT: snez a0, a0 -; ZVE32X-NEXT: vmv.s.x v11, a0 +; ZVE32X-NEXT: xor a1, a4, a1 +; ZVE32X-NEXT: snez a1, a1 +; ZVE32X-NEXT: vmv.s.x v11, a1 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; ZVE32X-NEXT: vand.vi v11, v11, 1 ; ZVE32X-NEXT: vmsne.vi v0, v11, 0 +; ZVE32X-NEXT: ld a0, 72(a0) ; ZVE32X-NEXT: vmerge.vim v11, v8, 1, v0 ; ZVE32X-NEXT: vsetivli zero, 3, e8, mf4, tu, ma ; ZVE32X-NEXT: vslideup.vi v9, v11, 2 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; ZVE32X-NEXT: vmsne.vi v0, v9, 0 ; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0 -; ZVE32X-NEXT: xor a1, a2, a1 -; ZVE32X-NEXT: snez a0, a1 +; ZVE32X-NEXT: xor a0, a0, a2 +; ZVE32X-NEXT: snez a0, a0 ; ZVE32X-NEXT: vmv.s.x v10, a0 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; ZVE32X-NEXT: vand.vi v10, v10, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll index 35baa6808db603..e2075e074179c8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll @@ -812,14 +812,14 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV32-NEXT: vslidedown.vi v8, v8, 3 -; RV32-NEXT: vfmv.f.s fa5, v8 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: fld fa4, 32(sp) +; RV32-NEXT: fld fa5, 32(sp) +; RV32-NEXT: vfmv.f.s fa4, v8 ; RV32-NEXT: fld fa3, 40(sp) -; RV32-NEXT: fcvt.w.d a0, fa5 +; RV32-NEXT: fcvt.w.d a0, fa4 +; RV32-NEXT: fcvt.w.d a1, fa5 ; RV32-NEXT: fld fa5, 48(sp) -; RV32-NEXT: fcvt.w.d a1, fa4 ; RV32-NEXT: fcvt.w.d a2, fa3 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vslide1down.vx v8, v10, a0 ; RV32-NEXT: fcvt.w.d a0, fa5 ; RV32-NEXT: fld fa5, 56(sp) @@ -865,14 +865,14 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; RV64-i32-NEXT: vslide1down.vx v10, v10, a0 ; RV64-i32-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-i32-NEXT: vslidedown.vi v8, v8, 3 -; RV64-i32-NEXT: vfmv.f.s fa5, v8 -; RV64-i32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-i32-NEXT: fld fa4, 32(sp) +; RV64-i32-NEXT: fld fa5, 32(sp) +; RV64-i32-NEXT: vfmv.f.s fa4, v8 ; RV64-i32-NEXT: fld fa3, 40(sp) -; RV64-i32-NEXT: fcvt.l.d a0, fa5 +; RV64-i32-NEXT: fcvt.l.d a0, fa4 +; RV64-i32-NEXT: fcvt.l.d a1, fa5 ; RV64-i32-NEXT: fld fa5, 48(sp) -; RV64-i32-NEXT: fcvt.l.d a1, fa4 ; RV64-i32-NEXT: fcvt.l.d a2, fa3 +; RV64-i32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64-i32-NEXT: vslide1down.vx v8, v10, a0 ; RV64-i32-NEXT: fcvt.l.d a0, fa5 ; RV64-i32-NEXT: fld fa5, 56(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index f42f32e246585e..08cad29ab1b880 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -519,17 +519,17 @@ define <4 x i8> @mgather_truemask_v4i8(<4 x ptr> %ptrs, <4 x i8> %passthru) { ; RV64ZVE32F-LABEL: mgather_truemask_v4i8: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: ld a0, 16(a0) ; RV64ZVE32F-NEXT: lbu a1, 0(a1) -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: lbu a3, 0(a3) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vlse8.v v8, (a0), zero +; RV64ZVE32F-NEXT: vlse8.v v8, (a2), zero +; RV64ZVE32F-NEXT: lbu a0, 0(a0) +; 
RV64ZVE32F-NEXT: lbu a2, 0(a3) ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32F-NEXT: ret %v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 1), <4 x i8> %passthru) ret <4 x i8> %v @@ -1208,17 +1208,17 @@ define <4 x i16> @mgather_truemask_v4i16(<4 x ptr> %ptrs, <4 x i16> %passthru) { ; RV64ZVE32F-LABEL: mgather_truemask_v4i16: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: ld a0, 16(a0) ; RV64ZVE32F-NEXT: lh a1, 0(a1) -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: lh a3, 0(a3) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero +; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a3) ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32F-NEXT: ret %v = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1), <4 x i16> %passthru) ret <4 x i16> %v @@ -2257,17 +2257,17 @@ define <4 x i32> @mgather_truemask_v4i32(<4 x ptr> %ptrs, <4 x i32> %passthru) { ; RV64ZVE32F-LABEL: mgather_truemask_v4i32: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: ld a0, 16(a0) ; RV64ZVE32F-NEXT: lw a1, 0(a1) -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: lw a3, 0(a3) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero +; RV64ZVE32F-NEXT: vlse32.v v8, (a2), zero +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a3) ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32F-NEXT: ret %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 1), <4 x i32> %passthru) ret <4 x i32> %v @@ -6589,16 +6589,16 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: lw a4, 56(a2) ; RV32ZVE32F-NEXT: lw a5, 48(a2) ; RV32ZVE32F-NEXT: lw a6, 40(a2) -; RV32ZVE32F-NEXT: lw a7, 32(a2) -; RV32ZVE32F-NEXT: lw t0, 24(a2) -; RV32ZVE32F-NEXT: lw t1, 16(a2) -; RV32ZVE32F-NEXT: lw t2, 8(a2) +; RV32ZVE32F-NEXT: lw a7, 8(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t2 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 +; RV32ZVE32F-NEXT: lw t0, 16(a2) +; RV32ZVE32F-NEXT: lw t1, 24(a2) +; RV32ZVE32F-NEXT: lw a2, 32(a2) ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 @@ -7017,14 +7017,14 @@ define <4 x half> @mgather_truemask_v4f16(<4 x ptr> %ptrs, <4 x half> %passthru) ; RV64ZVE32F-LABEL: 
mgather_truemask_v4f16: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: ld a0, 16(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a1) -; RV64ZVE32F-NEXT: flh fa4, 0(a2) -; RV64ZVE32F-NEXT: flh fa3, 0(a3) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero +; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero +; RV64ZVE32F-NEXT: flh fa4, 0(a0) +; RV64ZVE32F-NEXT: flh fa3, 0(a3) ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa5 ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa4 ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa3 @@ -7940,14 +7940,14 @@ define <4 x float> @mgather_truemask_v4f32(<4 x ptr> %ptrs, <4 x float> %passthr ; RV64ZVE32F-LABEL: mgather_truemask_v4f32: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: ld a0, 16(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a1) -; RV64ZVE32F-NEXT: flw fa4, 0(a2) -; RV64ZVE32F-NEXT: flw fa3, 0(a3) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero +; RV64ZVE32F-NEXT: vlse32.v v8, (a2), zero +; RV64ZVE32F-NEXT: flw fa4, 0(a0) +; RV64ZVE32F-NEXT: flw fa3, 0(a3) ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa5 ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa4 ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa3 @@ -11632,16 +11632,16 @@ define <8 x double> @mgather_baseidx_v8f64(ptr %base, <8 x i64> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: lw a3, 56(a2) ; RV32ZVE32F-NEXT: lw a4, 48(a2) ; RV32ZVE32F-NEXT: lw a5, 40(a2) -; RV32ZVE32F-NEXT: lw a6, 32(a2) -; RV32ZVE32F-NEXT: lw a7, 24(a2) -; RV32ZVE32F-NEXT: lw t0, 16(a2) -; RV32ZVE32F-NEXT: lw t1, 8(a2) +; RV32ZVE32F-NEXT: lw a6, 8(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: lw t0, 24(a2) +; RV32ZVE32F-NEXT: lw a2, 32(a2) ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a3 @@ -12881,22 +12881,22 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_strided_2xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 2(a0) -; RV64ZVE32F-NEXT: lh a2, 8(a0) -; RV64ZVE32F-NEXT: lh a3, 10(a0) -; RV64ZVE32F-NEXT: lh a4, 18(a0) -; RV64ZVE32F-NEXT: lh a5, 24(a0) -; RV64ZVE32F-NEXT: lh a6, 26(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 16 +; RV64ZVE32F-NEXT: lh a2, 2(a0) +; RV64ZVE32F-NEXT: lh a3, 8(a0) +; RV64ZVE32F-NEXT: lh a4, 10(a0) +; RV64ZVE32F-NEXT: lh a5, 18(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 16 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: lh a6, 24(a0) +; RV64ZVE32F-NEXT: lh a0, 26(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a4 -; 
RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -12925,23 +12925,23 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_strided_2xSEW_with_offset: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 4 -; RV64ZVE32F-NEXT: lh a2, 6(a0) -; RV64ZVE32F-NEXT: lh a3, 12(a0) -; RV64ZVE32F-NEXT: lh a4, 14(a0) -; RV64ZVE32F-NEXT: lh a5, 22(a0) -; RV64ZVE32F-NEXT: lh a6, 28(a0) -; RV64ZVE32F-NEXT: lh a7, 30(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 20 +; RV64ZVE32F-NEXT: addi a2, a0, 4 +; RV64ZVE32F-NEXT: lh a3, 6(a0) +; RV64ZVE32F-NEXT: lh a4, 12(a0) +; RV64ZVE32F-NEXT: lh a5, 14(a0) +; RV64ZVE32F-NEXT: lh a6, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a1), zero -; RV64ZVE32F-NEXT: addi a0, a0, 20 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero +; RV64ZVE32F-NEXT: lh a2, 28(a0) +; RV64ZVE32F-NEXT: lh a0, 30(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -12970,23 +12970,23 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_reverse_unit_strided_2xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 28 -; RV64ZVE32F-NEXT: lh a2, 30(a0) -; RV64ZVE32F-NEXT: lh a3, 24(a0) -; RV64ZVE32F-NEXT: lh a4, 26(a0) -; RV64ZVE32F-NEXT: lh a5, 22(a0) -; RV64ZVE32F-NEXT: lh a6, 16(a0) -; RV64ZVE32F-NEXT: lh a7, 18(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 20 +; RV64ZVE32F-NEXT: addi a2, a0, 28 +; RV64ZVE32F-NEXT: lh a3, 30(a0) +; RV64ZVE32F-NEXT: lh a4, 24(a0) +; RV64ZVE32F-NEXT: lh a5, 26(a0) +; RV64ZVE32F-NEXT: lh a6, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a1), zero -; RV64ZVE32F-NEXT: addi a0, a0, 20 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero +; RV64ZVE32F-NEXT: lh a2, 16(a0) +; RV64ZVE32F-NEXT: lh a0, 18(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13015,23 +13015,23 @@ define <8 x i16> 
@mgather_reverse_strided_2xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_reverse_strided_2xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 28 -; RV64ZVE32F-NEXT: lh a2, 30(a0) -; RV64ZVE32F-NEXT: lh a3, 20(a0) -; RV64ZVE32F-NEXT: lh a4, 22(a0) -; RV64ZVE32F-NEXT: lh a5, 14(a0) -; RV64ZVE32F-NEXT: lh a6, 4(a0) -; RV64ZVE32F-NEXT: lh a7, 6(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 12 +; RV64ZVE32F-NEXT: addi a2, a0, 28 +; RV64ZVE32F-NEXT: lh a3, 30(a0) +; RV64ZVE32F-NEXT: lh a4, 20(a0) +; RV64ZVE32F-NEXT: lh a5, 22(a0) +; RV64ZVE32F-NEXT: lh a6, 14(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a1), zero -; RV64ZVE32F-NEXT: addi a0, a0, 12 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero +; RV64ZVE32F-NEXT: lh a2, 4(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13059,22 +13059,22 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_2xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 2(a0) -; RV64ZVE32F-NEXT: lh a2, 16(a0) -; RV64ZVE32F-NEXT: lh a3, 18(a0) -; RV64ZVE32F-NEXT: lh a4, 10(a0) -; RV64ZVE32F-NEXT: lh a5, 4(a0) -; RV64ZVE32F-NEXT: lh a6, 6(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: lh a2, 2(a0) +; RV64ZVE32F-NEXT: lh a3, 16(a0) +; RV64ZVE32F-NEXT: lh a4, 18(a0) +; RV64ZVE32F-NEXT: lh a5, 10(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 8 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: lh a6, 4(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13105,22 +13105,22 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_2xSEW_unaligned: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 2(a0) -; RV64ZVE32F-NEXT: lh a2, 18(a0) -; RV64ZVE32F-NEXT: lh a3, 20(a0) -; RV64ZVE32F-NEXT: lh a4, 10(a0) -; RV64ZVE32F-NEXT: lh a5, 4(a0) -; RV64ZVE32F-NEXT: lh a6, 6(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: lh a2, 2(a0) +; RV64ZVE32F-NEXT: lh a3, 18(a0) +; RV64ZVE32F-NEXT: lh a4, 20(a0) +; RV64ZVE32F-NEXT: lh a5, 10(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 8 -; 
RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: lh a6, 4(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13152,22 +13152,22 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned2(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_2xSEW_unaligned2: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 2 -; RV64ZVE32F-NEXT: lh a2, 4(a0) -; RV64ZVE32F-NEXT: lh a3, 18(a0) -; RV64ZVE32F-NEXT: lh a4, 20(a0) -; RV64ZVE32F-NEXT: lh a5, 10(a0) -; RV64ZVE32F-NEXT: lh a6, 6(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: addi a2, a0, 2 +; RV64ZVE32F-NEXT: lh a3, 4(a0) +; RV64ZVE32F-NEXT: lh a4, 18(a0) +; RV64ZVE32F-NEXT: lh a5, 20(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a1), zero -; RV64ZVE32F-NEXT: addi a0, a0, 8 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero +; RV64ZVE32F-NEXT: lh a2, 10(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13202,22 +13202,22 @@ define <8 x i16> @mgather_gather_4xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_4xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 2(a0) -; RV64ZVE32F-NEXT: lh a2, 4(a0) -; RV64ZVE32F-NEXT: lh a3, 6(a0) -; RV64ZVE32F-NEXT: lh a4, 18(a0) -; RV64ZVE32F-NEXT: lh a5, 20(a0) -; RV64ZVE32F-NEXT: lh a6, 22(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 16 +; RV64ZVE32F-NEXT: lh a2, 2(a0) +; RV64ZVE32F-NEXT: lh a3, 4(a0) +; RV64ZVE32F-NEXT: lh a4, 6(a0) +; RV64ZVE32F-NEXT: lh a5, 18(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 16 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: lh a6, 20(a0) +; RV64ZVE32F-NEXT: lh a0, 22(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; 
RV64ZVE32F-NEXT: ret @@ -13249,22 +13249,22 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_4xSEW_partial_align: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 2(a0) -; RV64ZVE32F-NEXT: lh a2, 4(a0) -; RV64ZVE32F-NEXT: lh a3, 6(a0) -; RV64ZVE32F-NEXT: lh a4, 18(a0) -; RV64ZVE32F-NEXT: lh a5, 20(a0) -; RV64ZVE32F-NEXT: lh a6, 22(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 16 +; RV64ZVE32F-NEXT: lh a2, 2(a0) +; RV64ZVE32F-NEXT: lh a3, 4(a0) +; RV64ZVE32F-NEXT: lh a4, 6(a0) +; RV64ZVE32F-NEXT: lh a5, 18(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 16 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: lh a6, 20(a0) +; RV64ZVE32F-NEXT: lh a0, 22(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13305,22 +13305,22 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_shuffle_rotate: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 10(a0) -; RV64ZVE32F-NEXT: lh a2, 12(a0) -; RV64ZVE32F-NEXT: lh a3, 14(a0) -; RV64ZVE32F-NEXT: lh a4, 2(a0) -; RV64ZVE32F-NEXT: lh a5, 4(a0) -; RV64ZVE32F-NEXT: lh a6, 6(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: lh a2, 10(a0) +; RV64ZVE32F-NEXT: lh a3, 12(a0) +; RV64ZVE32F-NEXT: lh a4, 14(a0) +; RV64ZVE32F-NEXT: lh a5, 2(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 8 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32F-NEXT: lh a6, 4(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13352,22 +13352,22 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_shuffle_vrgather: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 4(a0) -; RV64ZVE32F-NEXT: lh a2, 6(a0) -; RV64ZVE32F-NEXT: lh a3, 2(a0) -; RV64ZVE32F-NEXT: lh a4, 10(a0) -; RV64ZVE32F-NEXT: lh a5, 12(a0) -; RV64ZVE32F-NEXT: lh a6, 14(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: lh a2, 4(a0) +; RV64ZVE32F-NEXT: lh a3, 6(a0) +; RV64ZVE32F-NEXT: lh a4, 2(a0) +; RV64ZVE32F-NEXT: lh a5, 10(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 8 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: lh a6, 12(a0) +; RV64ZVE32F-NEXT: lh a0, 14(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; 
RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13853,12 +13853,12 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) { ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: addi a1, a0, 136 ; RV64ZVE32F-NEXT: lw a2, 140(a0) -; RV64ZVE32F-NEXT: lw a3, 0(a0) -; RV64ZVE32F-NEXT: lw a0, 4(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vlse32.v v8, (a1), zero +; RV64ZVE32F-NEXT: lw a1, 0(a0) +; RV64ZVE32F-NEXT: lw a0, 4(a0) ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr i32, ptr %base, <4 x i64> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index aa815e18ac1014..42e52436a7da09 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -5598,17 +5598,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; ; RV32ZVE32F-LABEL: mscatter_baseidx_v8i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -48 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 48 -; RV32ZVE32F-NEXT: sw s0, 44(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s1, 40(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s2, 36(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s3, 32(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s4, 28(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s5, 24(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s6, 20(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s7, 16(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s8, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: addi sp, sp, -32 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 32 +; RV32ZVE32F-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 24(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 20(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s4, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s5, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s6, 4(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s7, 0(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 @@ -5617,7 +5616,6 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: .cfi_offset s5, -24 ; RV32ZVE32F-NEXT: .cfi_offset s6, -28 ; RV32ZVE32F-NEXT: .cfi_offset s7, -32 -; RV32ZVE32F-NEXT: .cfi_offset s8, -36 ; RV32ZVE32F-NEXT: lw a3, 60(a0) ; RV32ZVE32F-NEXT: lw a4, 56(a0) ; RV32ZVE32F-NEXT: lw a5, 52(a0) @@ -5635,16 +5633,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: lw s2, 56(a2) ; RV32ZVE32F-NEXT: lw s3, 48(a2) ; RV32ZVE32F-NEXT: lw s4, 40(a2) -; RV32ZVE32F-NEXT: lw s5, 32(a2) -; RV32ZVE32F-NEXT: lw s6, 24(a2) -; RV32ZVE32F-NEXT: lw s7, 
16(a2) -; RV32ZVE32F-NEXT: lw s8, 8(a2) +; RV32ZVE32F-NEXT: lw s5, 8(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s8 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s6 +; RV32ZVE32F-NEXT: lw s6, 16(a2) +; RV32ZVE32F-NEXT: lw s7, 24(a2) +; RV32ZVE32F-NEXT: lw a2, 32(a2) ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s6 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s3 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2 @@ -5682,16 +5680,15 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: sw a4, 0(a0) ; RV32ZVE32F-NEXT: sw a3, 4(a0) ; RV32ZVE32F-NEXT: .LBB51_9: # %else14 -; RV32ZVE32F-NEXT: lw s0, 44(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s1, 40(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s2, 36(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s3, 32(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s4, 28(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s5, 24(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s6, 20(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s7, 16(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s8, 12(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: addi sp, sp, 48 +; RV32ZVE32F-NEXT: lw s0, 28(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 24(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 20(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s4, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s5, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s6, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s7, 0(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 32 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB51_10: # %cond.store ; RV32ZVE32F-NEXT: lw a2, 4(a0) @@ -10227,16 +10224,16 @@ define void @mscatter_baseidx_v8f64(<8 x double> %val, ptr %base, <8 x i64> %idx ; RV32ZVE32F-NEXT: lw a2, 56(a1) ; RV32ZVE32F-NEXT: lw a3, 48(a1) ; RV32ZVE32F-NEXT: lw a4, 40(a1) -; RV32ZVE32F-NEXT: lw a5, 32(a1) -; RV32ZVE32F-NEXT: lw a6, 24(a1) -; RV32ZVE32F-NEXT: lw a7, 16(a1) -; RV32ZVE32F-NEXT: lw t0, 8(a1) +; RV32ZVE32F-NEXT: lw a5, 8(a1) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vlse32.v v8, (a1), zero -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV32ZVE32F-NEXT: lw a6, 16(a1) +; RV32ZVE32F-NEXT: lw a7, 24(a1) +; RV32ZVE32F-NEXT: lw a1, 32(a1) ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll index 19f3d3ce19fa4c..7be015e26b0983 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll @@ -19,9 +19,9 @@ define <2 x half> @vp_nearbyint_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext % ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; 
CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x half> @llvm.vp.nearbyint.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -38,9 +38,9 @@ define <2 x half> @vp_nearbyint_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x half> @llvm.vp.nearbyint.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x half> %v @@ -61,9 +61,9 @@ define <4 x half> @vp_nearbyint_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext % ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x half> @llvm.vp.nearbyint.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -80,9 +80,9 @@ define <4 x half> @vp_nearbyint_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x half> @llvm.vp.nearbyint.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x half> %v @@ -103,9 +103,9 @@ define <8 x half> @vp_nearbyint_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext % ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x half> @llvm.vp.nearbyint.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -122,9 +122,9 @@ define <8 x half> @vp_nearbyint_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x half> @llvm.vp.nearbyint.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x half> %v @@ -147,9 +147,9 @@ define <16 x half> @vp_nearbyint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x half> @llvm.vp.nearbyint.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -166,9 +166,9 @@ define <16 x half> @vp_nearbyint_v16f16_unmasked(<16 x half> %va, i32 zeroext %e ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: 
vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x half> @llvm.vp.nearbyint.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x half> %v @@ -189,9 +189,9 @@ define <2 x float> @vp_nearbyint_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x float> @llvm.vp.nearbyint.v2f32(<2 x float> %va, <2 x i1> %m, i32 %evl) ret <2 x float> %v @@ -208,9 +208,9 @@ define <2 x float> @vp_nearbyint_v2f32_unmasked(<2 x float> %va, i32 zeroext %ev ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x float> @llvm.vp.nearbyint.v2f32(<2 x float> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x float> %v @@ -231,9 +231,9 @@ define <4 x float> @vp_nearbyint_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x float> @llvm.vp.nearbyint.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl) ret <4 x float> %v @@ -250,9 +250,9 @@ define <4 x float> @vp_nearbyint_v4f32_unmasked(<4 x float> %va, i32 zeroext %ev ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x float> @llvm.vp.nearbyint.v4f32(<4 x float> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x float> %v @@ -275,9 +275,9 @@ define <8 x float> @vp_nearbyint_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x float> @llvm.vp.nearbyint.v8f32(<8 x float> %va, <8 x i1> %m, i32 %evl) ret <8 x float> %v @@ -294,9 +294,9 @@ define <8 x float> @vp_nearbyint_v8f32_unmasked(<8 x float> %va, i32 zeroext %ev ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x float> @llvm.vp.nearbyint.v8f32(<8 x float> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x float> %v @@ -319,9 +319,9 @@ define <16 x float> @vp_nearbyint_v16f32(<16 x float> %va, <16 x i1> %m, i32 zer ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x float> @llvm.vp.nearbyint.v16f32(<16 x 
float> %va, <16 x i1> %m, i32 %evl) ret <16 x float> %v @@ -338,9 +338,9 @@ define <16 x float> @vp_nearbyint_v16f32_unmasked(<16 x float> %va, i32 zeroext ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x float> @llvm.vp.nearbyint.v16f32(<16 x float> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x float> %v @@ -361,9 +361,9 @@ define <2 x double> @vp_nearbyint_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroe ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double> %va, <2 x i1> %m, i32 %evl) ret <2 x double> %v @@ -380,9 +380,9 @@ define <2 x double> @vp_nearbyint_v2f64_unmasked(<2 x double> %va, i32 zeroext % ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x double> %v @@ -405,9 +405,9 @@ define <4 x double> @vp_nearbyint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroe ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl) ret <4 x double> %v @@ -424,9 +424,9 @@ define <4 x double> @vp_nearbyint_v4f64_unmasked(<4 x double> %va, i32 zeroext % ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x double> %v @@ -449,9 +449,9 @@ define <8 x double> @vp_nearbyint_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroe ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl) ret <8 x double> %v @@ -468,9 +468,9 @@ define <8 x double> @vp_nearbyint_v8f64_unmasked(<8 x double> %va, i32 zeroext % ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x double> %v @@ -493,9 +493,9 @@ define <15 x double> @vp_nearbyint_v15f64(<15 x 
double> %va, <15 x i1> %m, i32 z ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl) ret <15 x double> %v @@ -512,9 +512,9 @@ define <15 x double> @vp_nearbyint_v15f64_unmasked(<15 x double> %va, i32 zeroex ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double> %va, <15 x i1> splat (i1 true), i32 %evl) ret <15 x double> %v @@ -537,9 +537,9 @@ define <16 x double> @vp_nearbyint_v16f64(<16 x double> %va, <16 x i1> %m, i32 z ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl) ret <16 x double> %v @@ -556,9 +556,9 @@ define <16 x double> @vp_nearbyint_v16f64_unmasked(<16 x double> %va, i32 zeroex ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x double> %v @@ -617,9 +617,9 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vmv.v.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 @@ -660,9 +660,9 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl) ret <32 x double> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll index 7dcd4c41998279..ed2ed2a2ebfaa0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll @@ -5,8 +5,8 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vv_v6i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: lbu a2, 0(a2) +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: slli a1, a2, 30 ; RV32-NEXT: srli a1, a1, 31 @@ -35,8 +35,8 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; 
; RV64-LABEL: vselect_vv_v6i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: lbu a2, 0(a2) +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a1) ; RV64-NEXT: slli a1, a2, 62 ; RV64-NEXT: srli a1, a1, 63 @@ -73,8 +73,8 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vx_v6i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: lbu a2, 0(a2) +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: slli a1, a2, 30 ; RV32-NEXT: srli a1, a1, 31 @@ -104,8 +104,8 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vx_v6i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: lbu a2, 0(a2) +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a1) ; RV64-NEXT: slli a1, a2, 62 ; RV64-NEXT: srli a1, a1, 63 @@ -144,8 +144,8 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vi_v6i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: slli a0, a1, 30 ; RV32-NEXT: srli a0, a0, 31 @@ -175,8 +175,8 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vi_v6i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: slli a0, a1, 62 ; RV64-NEXT: srli a0, a0, 63 @@ -214,8 +214,8 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vv_v6f32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: lbu a2, 0(a2) +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: slli a1, a2, 30 ; RV32-NEXT: srli a1, a1, 31 @@ -244,8 +244,8 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vv_v6f32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: lbu a2, 0(a2) +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a1) ; RV64-NEXT: slli a1, a2, 62 ; RV64-NEXT: srli a1, a1, 63 @@ -282,8 +282,8 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vx_v6f32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: slli a0, a1, 30 ; RV32-NEXT: srli a0, a0, 31 @@ -313,8 +313,8 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vx_v6f32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: slli a0, a1, 62 ; RV64-NEXT: srli a0, a0, 63 @@ -353,8 +353,8 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vfpzero_v6f32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: lbu a1, 0(a1) +; 
RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: slli a0, a1, 30 ; RV32-NEXT: srli a0, a0, 31 @@ -384,8 +384,8 @@ define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vfpzero_v6f32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: slli a0, a1, 62 ; RV64-NEXT: srli a0, a0, 63 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll index a4a5917fd4f9e6..b1726be941e3ef 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -768,8 +768,8 @@ define <4 x i32> @vwadd_vx_v4i32_i32(ptr %x, ptr %y) { define <2 x i64> @vwadd_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwadd_vx_v2i64_i8: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lb a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -796,8 +796,8 @@ define <2 x i64> @vwadd_vx_v2i64_i8(ptr %x, ptr %y) nounwind { define <2 x i64> @vwadd_vx_v2i64_i16(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwadd_vx_v2i64_i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lh a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -824,8 +824,8 @@ define <2 x i64> @vwadd_vx_v2i64_i16(ptr %x, ptr %y) nounwind { define <2 x i64> @vwadd_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwadd_vx_v2i64_i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -853,9 +853,9 @@ define <2 x i64> @vwadd_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwadd_vx_v2i64_i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lw a2, 4(a1) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw a2, 12(sp) ; RV32-NEXT: sw a1, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll index bc0bf5dd76ad45..f6d9695c514907 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll @@ -769,8 +769,8 @@ define <2 x i64> @vwaddu_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwaddu_vx_v2i64_i8: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -801,8 +801,8 @@ define <2 x i64> @vwaddu_vx_v2i64_i16(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwaddu_vx_v2i64_i16: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lhu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -833,8 +833,8 @@ define <2 x i64> @vwaddu_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; 
RV32-LABEL: vwaddu_vx_v2i64_i32: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -865,9 +865,9 @@ define <2 x i64> @vwaddu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwaddu_vx_v2i64_i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lw a2, 4(a1) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw a2, 12(sp) ; RV32-NEXT: sw a1, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll index 2abd34f01c14c0..c87584ab635135 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -883,9 +883,9 @@ define <2 x i64> @vwmul_vx_v2i64_i64(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lw a2, 4(a1) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a2, 12(sp) ; RV32-NEXT: sw a1, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll index 921037db2ea99e..a56984577ea749 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll @@ -793,8 +793,8 @@ define <2 x i64> @vwmulsu_vx_v2i64_i8(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -827,8 +827,8 @@ define <2 x i64> @vwmulsu_vx_v2i64_i16(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lhu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -861,8 +861,8 @@ define <2 x i64> @vwmulsu_vx_v2i64_i32(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll index 154093d759d6dd..2782a5fbb1eaed 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -769,8 +769,8 @@ define <4 x i32> @vwsub_vx_v4i32_i32(ptr %x, ptr %y) { define <2 x i64> @vwsub_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsub_vx_v2i64_i8: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lb a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -798,8 +798,8 @@ define <2 x i64> @vwsub_vx_v2i64_i8(ptr %x, ptr %y) nounwind { define <2 x i64> @vwsub_vx_v2i64_i16(ptr 
%x, ptr %y) nounwind { ; RV32-LABEL: vwsub_vx_v2i64_i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lh a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -827,8 +827,8 @@ define <2 x i64> @vwsub_vx_v2i64_i16(ptr %x, ptr %y) nounwind { define <2 x i64> @vwsub_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsub_vx_v2i64_i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -856,9 +856,9 @@ define <2 x i64> @vwsub_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsub_vx_v2i64_i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lw a2, 4(a1) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw a2, 12(sp) ; RV32-NEXT: sw a1, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll index a084b5383b4030..ccbc26c84d80d4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -770,8 +770,8 @@ define <2 x i64> @vwsubu_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsubu_vx_v2i64_i8: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -803,8 +803,8 @@ define <2 x i64> @vwsubu_vx_v2i64_i16(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsubu_vx_v2i64_i16: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lhu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -836,8 +836,8 @@ define <2 x i64> @vwsubu_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsubu_vx_v2i64_i32: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -868,9 +868,9 @@ define <2 x i64> @vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsubu_vx_v2i64_i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lw a2, 4(a1) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw a2, 12(sp) ; RV32-NEXT: sw a1, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll index b78b8663eac90b..02cfd3de6b4db7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll @@ -1012,77 +1012,99 @@ define @vfmax_vv_nxv16f64( %va, @vfmax_vv_nxv16f64( %va, @vfmax_vv_nxv16f64_unmasked( ; CHECK-NEXT: slli a3, a1, 3 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vl8re64.v v24, (a3) +; CHECK-NEXT: sub a3, a2, a1 +; CHECK-NEXT: sltu a4, a2, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma 
+; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vl8re64.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: sub a0, a2, a1 -; CHECK-NEXT: sltu a3, a2, a0 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a0, a3, a0 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll index 69c76152910e86..72a47ca2a60572 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll @@ -1012,77 +1012,99 @@ define @vfmin_vv_nxv16f64( %va, @vfmin_vv_nxv16f64( %va, @vfmin_vv_nxv16f64_unmasked( ; CHECK-NEXT: slli a3, a1, 3 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vl8re64.v v24, (a3) +; CHECK-NEXT: sub a3, a2, a1 +; CHECK-NEXT: sltu a4, a2, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vl8re64.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: sub a0, a2, a1 -; CHECK-NEXT: sltu a3, a2, a0 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a0, a3, a0 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll index f90237b8d7e95d..f88a9b3081a1a8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll @@ -19,9 +19,9 @@ define @nearbyint_nxv1f16( %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv1f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -42,9 +42,9 @@ define @nearbyint_nxv2f16( %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv2f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -65,9 +65,9 @@ define @nearbyint_nxv4f16( %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv4f16( %v, metadata !"round.dynamic", metadata 
!"fpexcept.strict") ret %r @@ -88,9 +88,9 @@ define @nearbyint_nxv8f16( %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv8f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -111,9 +111,9 @@ define @nearbyint_nxv16f16( %v) strictf ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv16f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -134,9 +134,9 @@ define @nearbyint_nxv32f16( %v) strictf ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv32f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -157,9 +157,9 @@ define @nearbyint_nxv1f32( %v) strictfp ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv1f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -180,9 +180,9 @@ define @nearbyint_nxv2f32( %v) strictfp ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv2f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -203,9 +203,9 @@ define @nearbyint_nxv4f32( %v) strictfp ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv4f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -226,9 +226,9 @@ define @nearbyint_nxv8f32( %v) strictfp ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv8f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -249,9 +249,9 @@ define @nearbyint_nxv16f32( %v) stric ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call 
@llvm.experimental.constrained.nearbyint.nxv16f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -272,9 +272,9 @@ define @nearbyint_nxv1f64( %v) strict ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv1f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -295,9 +295,9 @@ define @nearbyint_nxv2f64( %v) strict ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv2f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -318,9 +318,9 @@ define @nearbyint_nxv4f64( %v) strict ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv4f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -341,9 +341,9 @@ define @nearbyint_nxv8f64( %v) strict ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv8f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll index 9aa356b9b65e0b..9e14852305caa1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll @@ -15,9 +15,9 @@ define @nearbyint_nxv1f16( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv1f16( %x) ret %a @@ -35,9 +35,9 @@ define @nearbyint_nxv2f16( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv2f16( %x) ret %a @@ -55,9 +55,9 @@ define @nearbyint_nxv4f16( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv4f16( %x) ret %a @@ -75,9 +75,9 @@ define @nearbyint_nxv8f16( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t 
+; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv8f16( %x) ret %a @@ -95,9 +95,9 @@ define @nearbyint_nxv16f16( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv16f16( %x) ret %a @@ -115,9 +115,9 @@ define @nearbyint_nxv32f16( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv32f16( %x) ret %a @@ -135,9 +135,9 @@ define @nearbyint_nxv1f32( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv1f32( %x) ret %a @@ -155,9 +155,9 @@ define @nearbyint_nxv2f32( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv2f32( %x) ret %a @@ -175,9 +175,9 @@ define @nearbyint_nxv4f32( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv4f32( %x) ret %a @@ -195,9 +195,9 @@ define @nearbyint_nxv8f32( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv8f32( %x) ret %a @@ -215,9 +215,9 @@ define @nearbyint_nxv16f32( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv16f32( %x) ret %a @@ -235,9 +235,9 @@ define @nearbyint_nxv1f64( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv1f64( %x) ret %a @@ -255,9 +255,9 @@ define @nearbyint_nxv2f64( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv2f64( %x) ret %a @@ -275,9 +275,9 @@ define @nearbyint_nxv4f64( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; 
CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv4f64( %x) ret %a @@ -295,9 +295,9 @@ define @nearbyint_nxv8f64( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv8f64( %x) ret %a diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll index 277cd7dcdabce5..249f765971b095 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll @@ -960,108 +960,87 @@ define @fshr_v16i64( %a, @fshr_v16i64( %a, @fshl_v16i64( %a, @fshl_v16i64( %a, %ptrs0, %ptr ; ; RV64-LABEL: mgather_nxv16i64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 3 -; RV64-NEXT: sub sp, sp, a3 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: vl8re64.v v24, (a0) -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV64-NEXT: vmv8r.v v16, v8 -; RV64-NEXT: vl8re64.v v8, (a1) ; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; RV64-NEXT: vluxei64.v v24, (zero), v16, v0.t +; RV64-NEXT: vluxei64.v v24, (zero), v8, v0.t +; RV64-NEXT: vl8re64.v v8, (a1) ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: srli a1, a0, 3 ; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vluxei64.v v8, (zero), v16, v0.t ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, a2, a0 ; RV64-NEXT: vs8r.v v8, (a0) ; RV64-NEXT: vs8r.v v24, (a2) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %p0 = call @llvm.vector.insert.nxv8p0.nxv16p0( undef, %ptrs0, i64 0) %p1 = call @llvm.vector.insert.nxv8p0.nxv16p0( %p0, %ptrs1, i64 8) diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll index 0e09f59b6a20fe..fc8fdf4aaafe24 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll @@ -1714,8 +1714,8 @@ define void @mscatter_nxv16f64( %val0, @vp_nearbyint_nxv1f16( %va, @vp_nearbyint_nxv1f16( %va, @llvm.vp.nearbyint.nxv1f16( %va, %m, i32 %evl) ret %v @@ -63,9 +63,9 @@ define @vp_nearbyint_nxv1f16_unmasked( %v ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv1f16_unmasked: @@ -80,11 +80,11 @@ define @vp_nearbyint_nxv1f16_unmasked( %v ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; 
ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv1f16( %va, splat (i1 true), i32 %evl) ret %v @@ -105,9 +105,9 @@ define @vp_nearbyint_nxv2f16( %va, @vp_nearbyint_nxv2f16( %va, @llvm.vp.nearbyint.nxv2f16( %va, %m, i32 %evl) ret %v @@ -145,9 +145,9 @@ define @vp_nearbyint_nxv2f16_unmasked( %v ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv2f16_unmasked: @@ -162,11 +162,11 @@ define @vp_nearbyint_nxv2f16_unmasked( %v ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v @@ -187,9 +187,9 @@ define @vp_nearbyint_nxv4f16( %va, @vp_nearbyint_nxv4f16( %va, @llvm.vp.nearbyint.nxv4f16( %va, %m, i32 %evl) ret %v @@ -229,9 +229,9 @@ define @vp_nearbyint_nxv4f16_unmasked( %v ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv4f16_unmasked: @@ -246,11 +246,11 @@ define @vp_nearbyint_nxv4f16_unmasked( %v ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv4f16( %va, splat (i1 true), i32 %evl) ret %v @@ -273,9 +273,9 @@ define @vp_nearbyint_nxv8f16( %va, @vp_nearbyint_nxv8f16( %va, @llvm.vp.nearbyint.nxv8f16( %va, %m, i32 %evl) ret %v @@ -315,9 +315,9 @@ define @vp_nearbyint_nxv8f16_unmasked( %v ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv8f16_unmasked: @@ -332,11 +332,11 @@ define @vp_nearbyint_nxv8f16_unmasked( %v ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv8f16( %va, splat (i1 true), i32 %evl) ret %v @@ -359,9 +359,9 @@ define @vp_nearbyint_nxv16f16( %va, @vp_nearbyint_nxv16f16( %va, @llvm.vp.nearbyint.nxv16f16( %va, %m, i32 %evl) ret %v @@ -401,9 +401,9 @@ define 
@vp_nearbyint_nxv16f16_unmasked( ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv16f16_unmasked: @@ -418,11 +418,11 @@ define @vp_nearbyint_nxv16f16_unmasked( ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16f16( %va, splat (i1 true), i32 %evl) ret %v @@ -445,9 +445,9 @@ define @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16_unmasked( ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv32f16_unmasked: @@ -589,11 +590,11 @@ define @vp_nearbyint_nxv32f16_unmasked( ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -618,9 +619,9 @@ define @vp_nearbyint_nxv1f32( %va, @llvm.vp.nearbyint.nxv1f32( %va, %m, i32 %evl) ret %v @@ -637,9 +638,9 @@ define @vp_nearbyint_nxv1f32_unmasked( ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv1f32( %va, splat (i1 true), i32 %evl) ret %v @@ -660,9 +661,9 @@ define @vp_nearbyint_nxv2f32( %va, @llvm.vp.nearbyint.nxv2f32( %va, %m, i32 %evl) ret %v @@ -679,9 +680,9 @@ define @vp_nearbyint_nxv2f32_unmasked( ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv2f32( %va, splat (i1 true), i32 %evl) ret %v @@ -704,9 +705,9 @@ define @vp_nearbyint_nxv4f32( %va, @llvm.vp.nearbyint.nxv4f32( %va, %m, i32 %evl) ret %v @@ -723,9 +724,9 @@ define @vp_nearbyint_nxv4f32_unmasked( ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv4f32( %va, splat (i1 true), i32 %evl) ret %v @@ -748,9 +749,9 @@ define @vp_nearbyint_nxv8f32( %va, @llvm.vp.nearbyint.nxv8f32( %va, %m, i32 
%evl) ret %v @@ -767,9 +768,9 @@ define @vp_nearbyint_nxv8f32_unmasked( ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv8f32( %va, splat (i1 true), i32 %evl) ret %v @@ -792,9 +793,9 @@ define @vp_nearbyint_nxv16f32( %va, < ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16f32( %va, %m, i32 %evl) ret %v @@ -811,9 +812,9 @@ define @vp_nearbyint_nxv16f32_unmasked( @llvm.vp.nearbyint.nxv16f32( %va, splat (i1 true), i32 %evl) ret %v @@ -834,9 +835,9 @@ define @vp_nearbyint_nxv1f64( %va, @llvm.vp.nearbyint.nxv1f64( %va, %m, i32 %evl) ret %v @@ -853,9 +854,9 @@ define @vp_nearbyint_nxv1f64_unmasked( @llvm.vp.nearbyint.nxv1f64( %va, splat (i1 true), i32 %evl) ret %v @@ -878,9 +879,9 @@ define @vp_nearbyint_nxv2f64( %va, @llvm.vp.nearbyint.nxv2f64( %va, %m, i32 %evl) ret %v @@ -897,9 +898,9 @@ define @vp_nearbyint_nxv2f64_unmasked( @llvm.vp.nearbyint.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v @@ -922,9 +923,9 @@ define @vp_nearbyint_nxv4f64( %va, @llvm.vp.nearbyint.nxv4f64( %va, %m, i32 %evl) ret %v @@ -941,9 +942,9 @@ define @vp_nearbyint_nxv4f64_unmasked( @llvm.vp.nearbyint.nxv4f64( %va, splat (i1 true), i32 %evl) ret %v @@ -966,9 +967,9 @@ define @vp_nearbyint_nxv7f64( %va, @llvm.vp.nearbyint.nxv7f64( %va, %m, i32 %evl) ret %v @@ -985,9 +986,9 @@ define @vp_nearbyint_nxv7f64_unmasked( @llvm.vp.nearbyint.nxv7f64( %va, splat (i1 true), i32 %evl) ret %v @@ -1010,9 +1011,9 @@ define @vp_nearbyint_nxv8f64( %va, @llvm.vp.nearbyint.nxv8f64( %va, %m, i32 %evl) ret %v @@ -1029,9 +1030,9 @@ define @vp_nearbyint_nxv8f64_unmasked( @llvm.vp.nearbyint.nxv8f64( %va, splat (i1 true), i32 %evl) ret %v @@ -1046,16 +1047,15 @@ define @vp_nearbyint_nxv16f64( %va, ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v24, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vmv8r.v v24, v16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v25, v0, a2 +; CHECK-NEXT: vslidedown.vx v6, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 @@ -1063,60 +1063,41 @@ define @vp_nearbyint_nxv16f64( %va, ; CHECK-NEXT: lui a3, %hi(.LCPI32_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vmv8r.v v8, v16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v 
v0, v6 ; CHECK-NEXT: vfabs.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v6, v16, fa5, v0.t ; CHECK-NEXT: frflags a2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: fsflags a2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -1153,9 +1134,9 @@ define @vp_nearbyint_nxv16f64_unmasked( @llvm.vp.nearbyint.nxv16f64( %va, splat (i1 true), i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index 897bfdea69f1f4..cc967396153bc1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -2203,17 +2203,17 @@ define @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv32f64( %va, @fcmp_oeq_vv_nxv32f64( %va, @fcmp_oeq_vv_nxv32f64( %va, @fcmp_oeq_vv_nxv32f64( %va, @icmp_eq_vv_nxv128i8( %va, @icmp_eq_vv_nxv128i8( %va, @icmp_eq_vv_nxv128i8( %va, @icmp_eq_vv_nxv32i32( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vpmerge_vv_nxv128i8( %va, @select_nxv32i32( %a, @select_evl_nxv32i32( %a, @select_nxv16f64( %a, @test6(i64 %avl, i8 zeroext %cond, @llvm.riscv.vwadd.w.nxv2i32.nxv2i16(This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle64.v v8, (a2) ; CHECK-NEXT: vle64.v v9, (a3) ; CHECK-NEXT: vfadd.vv v8, v8, v9 ; CHECK-NEXT: vse64.v v8, (a1) -; CHECK-NEXT: add a5, a5, a6 -; CHECK-NEXT: add a1, a1, a4 -; CHECK-NEXT: add a3, a3, a4 -; 
CHECK-NEXT: add a2, a2, a4 -; CHECK-NEXT: blt a5, a0, .LBB12_2 +; CHECK-NEXT: add a4, a4, a6 +; CHECK-NEXT: add a1, a1, a5 +; CHECK-NEXT: add a3, a3, a5 +; CHECK-NEXT: add a2, a2, a5 +; CHECK-NEXT: blt a4, a0, .LBB12_2 ; CHECK-NEXT: .LBB12_3: # %for.end ; CHECK-NEXT: ret entry: @@ -678,18 +677,18 @@ for.end: ; preds = %for.body, %entry define void @vector_init_vlmax(i64 %N, ptr %c) { ; CHECK-LABEL: vector_init_vlmax: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; CHECK-NEXT: blez a0, .LBB13_3 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: slli a4, a2, 3 +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: vsetvli a3, zero, e64, m1, ta, ma +; CHECK-NEXT: slli a4, a3, 3 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: .LBB13_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vse64.v v8, (a1) -; CHECK-NEXT: add a3, a3, a2 +; CHECK-NEXT: add a2, a2, a3 ; CHECK-NEXT: add a1, a1, a4 -; CHECK-NEXT: blt a3, a0, .LBB13_2 +; CHECK-NEXT: blt a2, a0, .LBB13_2 ; CHECK-NEXT: .LBB13_3: # %for.end ; CHECK-NEXT: ret entry: @@ -714,20 +713,20 @@ for.end: ; preds = %for.body, %entry define void @vector_init_vsetvli_N(i64 %N, ptr %c) { ; CHECK-LABEL: vector_init_vsetvli_N: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli a2, a0, e64, m1, ta, ma ; CHECK-NEXT: blez a0, .LBB14_3 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: slli a4, a2, 3 +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: vsetvli a3, a0, e64, m1, ta, ma +; CHECK-NEXT: slli a4, a3, 3 ; CHECK-NEXT: vsetvli a5, zero, e64, m1, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: .LBB14_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a3, e64, m1, ta, ma ; CHECK-NEXT: vse64.v v8, (a1) -; CHECK-NEXT: add a3, a3, a2 +; CHECK-NEXT: add a2, a2, a3 ; CHECK-NEXT: add a1, a1, a4 -; CHECK-NEXT: blt a3, a0, .LBB14_2 +; CHECK-NEXT: blt a2, a0, .LBB14_2 ; CHECK-NEXT: .LBB14_3: # %for.end ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll index 12bb4d27b0f979..da0c1cfb50097c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll @@ -699,3 +699,27 @@ declare @llvm.riscv.vmsgt.nxv2i32.i32.i64(, declare @llvm.riscv.vmor.nxv2i1.i64(, , i64) declare void @llvm.riscv.vse.mask.nxv2i32.i64(, ptr nocapture, , i64) declare void @llvm.riscv.vse.nxv2i32.i64(, ptr nocapture, i64) + +define @avl_undef1(, , ) { +; CHECK-LABEL: avl_undef1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, tu, ma +; CHECK-NEXT: vadd.vv v8, v9, v10 +; CHECK-NEXT: ret + %a = call @llvm.riscv.vadd.nxv2i32.nxv2i32( + %0, + %1, + %2, + i64 undef + ) + ret %a +} + +define i64 @avl_undef2() { +; CHECK-LABEL: avl_undef2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, mf2, ta, ma +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.riscv.vsetvli(i64 poison, i64 2, i64 7) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll index f658a2c6b24a6c..c3b19b59ec3d68 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll @@ -11,9 +11,10 @@ define i32 @illegal_preserve_vl( %a, %x, pt ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; CHECK-NEXT: vadd.vv v12, v12, v12 -; CHECK-NEXT: vs4r.v v12, (a0) ; CHECK-NEXT: vsetvli 
zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: vs4r.v v12, (a0) +; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: ret %index = add %x, %x store %index, ptr %y diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll index 87f7a5badd0105..7511e5953dac14 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-int8,+amx-bf16,+avx512f -verify-machineinstrs | FileCheck %s -define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { +define void @test_amx(ptr %pointer, ptr %base, i64 %stride) { ; CHECK-LABEL: test_amx: ; CHECK: # %bb.0: ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 @@ -29,25 +29,178 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) - %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) - %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) + %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride) + %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride) %d0 = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b) %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b) %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b) %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b) - %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, i8* %base, i64 %stride) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4) + %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, ptr %base, i64 %stride) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %pointer, i64 %stride, x86_amx %d4) ret void } declare x86_amx @llvm.x86.tilezero.internal(i16, i16) -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) -declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64) +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, ptr, i64) declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) + +define void @PR90954(ptr %0, ptr %1, i32 %2) { +; CHECK-LABEL: PR90954: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: 
pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; CHECK-NEXT: subq $5120, %rsp # imm = 0x1400 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: shll $4, %edx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movw $64, %cx +; CHECK-NEXT: movw $16, %di +; CHECK-NEXT: movb $1, %r8b +; CHECK-NEXT: movl $64, %r9d +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r11 +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorl %r14d, %r14d +; CHECK-NEXT: jmp .LBB1_1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB1_5: # in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: incq %r14 +; CHECK-NEXT: addl %edx, %ebx +; CHECK-NEXT: .LBB1_1: # =>This Loop Header: Depth=1 +; CHECK-NEXT: # Child Loop BB1_2 Depth 2 +; CHECK-NEXT: movslq %ebx, %r15 +; CHECK-NEXT: leaq (%rsi,%r15,4), %r15 +; CHECK-NEXT: xorl %r12d, %r12d +; CHECK-NEXT: xorl %r13d, %r13d +; CHECK-NEXT: jmp .LBB1_2 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB1_4: # in Loop: Header=BB1_2 Depth=2 +; CHECK-NEXT: tilestored %tmm1, (%r15,%rax) +; CHECK-NEXT: incq %r13 +; CHECK-NEXT: addq $64, %r15 +; CHECK-NEXT: decq %r12 +; CHECK-NEXT: je .LBB1_5 +; CHECK-NEXT: .LBB1_2: # Parent Loop BB1_1 Depth=1 +; CHECK-NEXT: # => This Inner Loop Header: Depth=2 +; CHECK-NEXT: tilezero %tmm0 +; CHECK-NEXT: tilezero %tmm1 +; CHECK-NEXT: testb %r8b, %r8b +; CHECK-NEXT: jne .LBB1_4 +; CHECK-NEXT: # %bb.3: # in Loop: Header=BB1_2 Depth=2 +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: tileloadd (%r10,%r9), %tmm1 +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, 
{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: tileloadd (%r11,%r9), %tmm2 +; CHECK-NEXT: tdpbf16ps %tmm2, %tmm1, %tmm0 +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movabsq $64, %rax +; CHECK-NEXT: tilestored %tmm0, 3072(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm1 # 1024-byte Folded Reload +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: jmp .LBB1_4 + %4 = shl i32 %2, 4 + %5 = icmp eq i64 0, 0 + br label %6 + +6: ; preds = %31, %3 + %7 = phi i64 [ 0, %3 ], [ %32, %31 ] + %8 = trunc nuw nsw i64 %7 to i32 + %9 = mul i32 %4, %8 + %10 = mul i32 0, %8 + %11 = sext i32 %9 to i64 + %12 = getelementptr inbounds i32, ptr %1, i64 %11 + br label %13 + +13: ; preds = %25, %6 + %14 = phi i64 [ %29, %25 ], [ 0, %6 ] + %15 = tail call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64) + %16 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %15) + %17 = shl nsw i64 %14, 4 + %18 = getelementptr i32, ptr %0, i64 %17 + br i1 %5, label %25, label %19 + +19: ; preds = %13 + %20 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %16) + %21 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer) + %22 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer) + %23 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %20, x86_amx %21, x86_amx %22) + %24 = tail call noundef <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %23) + br label %25 + +25: ; preds = %19, %13 + %26 = phi <256 x i32> [ undef, %13 ], [ %24, %19 ] + %27 = getelementptr inbounds i32, ptr %12, i64 %17 + %28 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %26) + tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %27, i64 0, x86_amx %28) + %29 = add nuw nsw i64 %14, 1 + %30 = icmp eq i64 %29, 0 + br i1 %30, label %31, label %13 + +31: ; preds = %25 + %32 = add nuw nsw i64 %7, 1 + br label %6 +} + +declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) +declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) diff --git a/llvm/test/CodeGen/X86/align-branch-boundary-suppressions-tls.ll b/llvm/test/CodeGen/X86/align-branch-boundary-suppressions-tls.ll index d980e60e5fc55f..12beeeacc53098 100644 --- a/llvm/test/CodeGen/X86/align-branch-boundary-suppressions-tls.ll +++ b/llvm/test/CodeGen/X86/align-branch-boundary-suppressions-tls.ll @@ -2,8 +2,8 @@ ;; sequence. It uses prefixes to allow linker relaxation. We need to disable ;; prefix or nop padding for it. For simplicity and consistency, disable for ;; Local Dynamic and 32-bit as well. 
-; RUN: llc -mtriple=i386 -relocation-model=pic -x86-branches-within-32B-boundaries < %s | FileCheck --check-prefixes=CHECK,32 %s -; RUN: llc -mtriple=x86_64 -relocation-model=pic -x86-branches-within-32B-boundaries < %s | FileCheck --check-prefixes=CHECK,64 %s +; RUN: llc -mtriple=i386 -relocation-model=pic -x86-branches-within-32B-boundaries < %s | FileCheck --check-prefixes=CHECK,X86 %s +; RUN: llc -mtriple=x86_64 -relocation-model=pic -x86-branches-within-32B-boundaries < %s | FileCheck --check-prefixes=CHECK,X64 %s @gd = external thread_local global i32 @ld = internal thread_local global i32 0 @@ -11,17 +11,17 @@ define i32 @tls_get_addr() { ; CHECK-LABEL: tls_get_addr: ; CHECK: #noautopadding -; 32: leal gd@TLSGD(,%ebx), %eax -; 32: calll ___tls_get_addr@PLT -; 64: data16 -; 64: leaq gd@TLSGD(%rip), %rdi -; 64: callq __tls_get_addr@PLT +; X86: leal gd@TLSGD(,%ebx), %eax +; X86: calll ___tls_get_addr@PLT +; X64: data16 +; X64: leaq gd@TLSGD(%rip), %rdi +; X64: callq __tls_get_addr@PLT ; CHECK: #autopadding ; CHECK: #noautopadding -; 32: leal ld@TLSLDM(%ebx), %eax -; 32: calll ___tls_get_addr@PLT -; 64: leaq ld@TLSLD(%rip), %rdi -; 64: callq __tls_get_addr@PLT +; X86: leal ld@TLSLDM(%ebx), %eax +; X86: calll ___tls_get_addr@PLT +; X64: leaq ld@TLSLD(%rip), %rdi +; X64: callq __tls_get_addr@PLT ; CHECK: #autopadding %1 = load i32, i32* @gd %2 = load i32, i32* @ld diff --git a/llvm/test/CodeGen/X86/asm-modifier.ll b/llvm/test/CodeGen/X86/asm-modifier.ll index c121b46f845065..9a69402d221689 100644 --- a/llvm/test/CodeGen/X86/asm-modifier.ll +++ b/llvm/test/CodeGen/X86/asm-modifier.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=i686 < %s | FileCheck %s --check-prefixes=CHECK,32 -; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefixes=CHECK,64 +; RUN: llc -mtriple=i686 < %s | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefixes=CHECK,X64 @var = internal global i32 0, align 4 @@ -43,20 +43,20 @@ entry: } define void @test_V(ptr %p) { -; 32-LABEL: test_V: -; 32: # %bb.0: # %entry -; 32-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32-NEXT: #APP -; 32-NEXT: calll __x86_indirect_thunk_eax -; 32-NEXT: #NO_APP -; 32-NEXT: retl +; X86-LABEL: test_V: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: #APP +; X86-NEXT: calll __x86_indirect_thunk_eax +; X86-NEXT: #NO_APP +; X86-NEXT: retl ; -; 64-LABEL: test_V: -; 64: # %bb.0: # %entry -; 64-NEXT: #APP -; 64-NEXT: callq __x86_indirect_thunk_rdi -; 64-NEXT: #NO_APP -; 64-NEXT: retq +; X64-LABEL: test_V: +; X64: # %bb.0: # %entry +; X64-NEXT: #APP +; X64-NEXT: callq __x86_indirect_thunk_rdi +; X64-NEXT: #NO_APP +; X64-NEXT: retq entry: tail call void asm sideeffect "call __x86_indirect_thunk_${0:V}", "r,~{dirflag},~{fpsr},~{flags}"(ptr %p) ret void diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir index 679a908893f6ea..93292990dee02a 100644 --- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir +++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir @@ -1,5 +1,5 @@ -# RUN: llc %s -run-pass x86-avoid-SFB -mtriple=x86_64-unknown-linux-gnu -o - | FileCheck %s -check-prefixes DEBUG-LABEL,CHECK -# RUN: llc %s -run-pass x86-avoid-SFB -mtriple=x86_64-unknown-linux-gnu -o - | FileCheck %s -check-prefixes NODEBUG-LABEL,CHECK +# RUN: llc %s -run-pass x86-avoid-SFB -mtriple=x86_64-unknown-linux-gnu -o - | FileCheck %s 
-check-prefixes=CHECK,DEBUG +# RUN: llc %s -run-pass x86-avoid-SFB -mtriple=x86_64-unknown-linux-gnu -o - | FileCheck %s -check-prefixes=CHECK,NODEBUG # # This was generated from: # @@ -202,8 +202,8 @@ body: | MOVAPSmr %1, 1, $noreg, 0, $noreg, killed %2 :: (store (s128) into %ir.p2) RET 0 - ; DEBUG-LABEL: name: debug - ; NODEBUG-LABEL: name: nodebug + ; DEBUG: name: debug + ; NODEBUG: name: nodebug ; CHECK: %1:gr64 = COPY ; CHECK: %0:gr64 = COPY ; CHECK: MOV8mi @@ -218,5 +218,5 @@ body: | ; CHECK: %7:gr8 = MOV8rm ; CHECK: MOV8mr ; CHECK: RET 0 - ; DEBUG-LABEL: name: nodebug + ; DEBUG: name: nodebug ... diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll index 840e127ef21309..a6237ecde4d4fe 100644 --- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll +++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll @@ -234,7 +234,7 @@ entry: ret <16 x i16> %shuffle } -;;;; Cases with undef indicies mixed in the mask +;;;; Cases with undef indices mixed in the mask define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; ALL-LABEL: shuffle_v8f32_uu67u9ub: diff --git a/llvm/test/CodeGen/X86/cmp16.ll b/llvm/test/CodeGen/X86/cmp16.ll index 699ea3e4dd4734..fa9e75ff16a5ca 100644 --- a/llvm/test/CodeGen/X86/cmp16.ll +++ b/llvm/test/CodeGen/X86/cmp16.ll @@ -113,8 +113,7 @@ define i1 @cmp16_reg_eq_imm8(i16 %a0) { define i1 @cmp16_reg_eq_imm16(i16 %a0) { ; X86-GENERIC-LABEL: cmp16_reg_eq_imm16: ; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: cmpl $1024, %eax # imm = 0x400 +; X86-GENERIC-NEXT: cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400 ; X86-GENERIC-NEXT: sete %al ; X86-GENERIC-NEXT: retl ; @@ -177,12 +176,11 @@ define i1 @cmp16_reg_eq_imm16_minsize(i16 %a0) minsize { } define i1 @cmp16_reg_eq_imm16_optsize(i16 %a0) optsize { -; X86-GENERIC-LABEL: cmp16_reg_eq_imm16_optsize: -; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: cmpl $1024, %eax # imm = 0x400 -; X86-GENERIC-NEXT: sete %al -; X86-GENERIC-NEXT: retl +; X86-LABEL: cmp16_reg_eq_imm16_optsize: +; X86: # %bb.0: +; X86-NEXT: cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400 +; X86-NEXT: sete %al +; X86-NEXT: retl ; ; X64-GENERIC-LABEL: cmp16_reg_eq_imm16_optsize: ; X64-GENERIC: # %bb.0: @@ -191,24 +189,12 @@ define i1 @cmp16_reg_eq_imm16_optsize(i16 %a0) optsize { ; X64-GENERIC-NEXT: sete %al ; X64-GENERIC-NEXT: retq ; -; X86-FAST-LABEL: cmp16_reg_eq_imm16_optsize: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400 -; X86-FAST-NEXT: sete %al -; X86-FAST-NEXT: retl -; ; X64-FAST-LABEL: cmp16_reg_eq_imm16_optsize: ; X64-FAST: # %bb.0: ; X64-FAST-NEXT: cmpw $1024, %di # imm = 0x400 ; X64-FAST-NEXT: sete %al ; X64-FAST-NEXT: retq ; -; X86-ATOM-LABEL: cmp16_reg_eq_imm16_optsize: -; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400 -; X86-ATOM-NEXT: sete %al -; X86-ATOM-NEXT: retl -; ; X64-ATOM-LABEL: cmp16_reg_eq_imm16_optsize: ; X64-ATOM: # %bb.0: ; X64-ATOM-NEXT: cmpw $1024, %di # imm = 0x400 @@ -269,8 +255,7 @@ define i1 @cmp16_reg_sgt_imm8(i16 %a0) { define i1 @cmp16_reg_sgt_imm16(i16 %a0) { ; X86-GENERIC-LABEL: cmp16_reg_sgt_imm16: ; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: cmpl $-1023, %eax # imm = 0xFC01 +; X86-GENERIC-NEXT: cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01 ; X86-GENERIC-NEXT: setge %al ; X86-GENERIC-NEXT: retl ; @@ -333,12 +318,11 @@ define i1 
@cmp16_reg_sgt_imm16_minsize(i16 %a0) minsize { } define i1 @cmp16_reg_sgt_imm16_optsize(i16 %a0) optsize { -; X86-GENERIC-LABEL: cmp16_reg_sgt_imm16_optsize: -; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: cmpl $-1023, %eax # imm = 0xFC01 -; X86-GENERIC-NEXT: setge %al -; X86-GENERIC-NEXT: retl +; X86-LABEL: cmp16_reg_sgt_imm16_optsize: +; X86: # %bb.0: +; X86-NEXT: cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01 +; X86-NEXT: setge %al +; X86-NEXT: retl ; ; X64-GENERIC-LABEL: cmp16_reg_sgt_imm16_optsize: ; X64-GENERIC: # %bb.0: @@ -347,24 +331,12 @@ define i1 @cmp16_reg_sgt_imm16_optsize(i16 %a0) optsize { ; X64-GENERIC-NEXT: setge %al ; X64-GENERIC-NEXT: retq ; -; X86-FAST-LABEL: cmp16_reg_sgt_imm16_optsize: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01 -; X86-FAST-NEXT: setge %al -; X86-FAST-NEXT: retl -; ; X64-FAST-LABEL: cmp16_reg_sgt_imm16_optsize: ; X64-FAST: # %bb.0: ; X64-FAST-NEXT: cmpw $-1023, %di # imm = 0xFC01 ; X64-FAST-NEXT: setge %al ; X64-FAST-NEXT: retq ; -; X86-ATOM-LABEL: cmp16_reg_sgt_imm16_optsize: -; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01 -; X86-ATOM-NEXT: setge %al -; X86-ATOM-NEXT: retl -; ; X64-ATOM-LABEL: cmp16_reg_sgt_imm16_optsize: ; X64-ATOM: # %bb.0: ; X64-ATOM-NEXT: cmpw $-1023, %di # imm = 0xFC01 @@ -377,8 +349,7 @@ define i1 @cmp16_reg_sgt_imm16_optsize(i16 %a0) optsize { define i1 @cmp16_reg_uge_imm16(i16 %a0) { ; X86-GENERIC-LABEL: cmp16_reg_uge_imm16: ; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: cmpl $64512, %eax # imm = 0xFC00 +; X86-GENERIC-NEXT: cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00 ; X86-GENERIC-NEXT: setae %al ; X86-GENERIC-NEXT: retl ; @@ -441,12 +412,11 @@ define i1 @cmp16_reg_uge_imm16_minsize(i16 %a0) minsize { } define i1 @cmp16_reg_uge_imm16_optsize(i16 %a0) optsize { -; X86-GENERIC-LABEL: cmp16_reg_uge_imm16_optsize: -; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: cmpl $64512, %eax # imm = 0xFC00 -; X86-GENERIC-NEXT: setae %al -; X86-GENERIC-NEXT: retl +; X86-LABEL: cmp16_reg_uge_imm16_optsize: +; X86: # %bb.0: +; X86-NEXT: cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00 +; X86-NEXT: setae %al +; X86-NEXT: retl ; ; X64-GENERIC-LABEL: cmp16_reg_uge_imm16_optsize: ; X64-GENERIC: # %bb.0: @@ -455,24 +425,12 @@ define i1 @cmp16_reg_uge_imm16_optsize(i16 %a0) optsize { ; X64-GENERIC-NEXT: setae %al ; X64-GENERIC-NEXT: retq ; -; X86-FAST-LABEL: cmp16_reg_uge_imm16_optsize: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00 -; X86-FAST-NEXT: setae %al -; X86-FAST-NEXT: retl -; ; X64-FAST-LABEL: cmp16_reg_uge_imm16_optsize: ; X64-FAST: # %bb.0: ; X64-FAST-NEXT: cmpw $-1024, %di # imm = 0xFC00 ; X64-FAST-NEXT: setae %al ; X64-FAST-NEXT: retq ; -; X86-ATOM-LABEL: cmp16_reg_uge_imm16_optsize: -; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00 -; X86-ATOM-NEXT: setae %al -; X86-ATOM-NEXT: retl -; ; X64-ATOM-LABEL: cmp16_reg_uge_imm16_optsize: ; X64-ATOM: # %bb.0: ; X64-ATOM-NEXT: cmpw $-1024, %di # imm = 0xFC00 @@ -592,15 +550,13 @@ define i1 @cmp16_load_ne_imm16(ptr %p0) { ; X86-GENERIC-LABEL: cmp16_load_ne_imm16: ; X86-GENERIC: # %bb.0: ; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: movzwl (%eax), %eax -; X86-GENERIC-NEXT: cmpl $512, %eax # imm = 0x200 +; X86-GENERIC-NEXT: cmpw $512, (%eax) # imm = 0x200 ; X86-GENERIC-NEXT: setne %al ; 
X86-GENERIC-NEXT: retl ; ; X64-GENERIC-LABEL: cmp16_load_ne_imm16: ; X64-GENERIC: # %bb.0: -; X64-GENERIC-NEXT: movzwl (%rdi), %eax -; X64-GENERIC-NEXT: cmpl $512, %eax # imm = 0x200 +; X64-GENERIC-NEXT: cmpw $512, (%rdi) # imm = 0x200 ; X64-GENERIC-NEXT: setne %al ; X64-GENERIC-NEXT: retq ; @@ -694,15 +650,13 @@ define i1 @cmp16_load_slt_imm16(ptr %p0) { ; X86-GENERIC-LABEL: cmp16_load_slt_imm16: ; X86-GENERIC: # %bb.0: ; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: movswl (%eax), %eax -; X86-GENERIC-NEXT: cmpl $512, %eax # imm = 0x200 +; X86-GENERIC-NEXT: cmpw $512, (%eax) # imm = 0x200 ; X86-GENERIC-NEXT: setl %al ; X86-GENERIC-NEXT: retl ; ; X64-GENERIC-LABEL: cmp16_load_slt_imm16: ; X64-GENERIC: # %bb.0: -; X64-GENERIC-NEXT: movswl (%rdi), %eax -; X64-GENERIC-NEXT: cmpl $512, %eax # imm = 0x200 +; X64-GENERIC-NEXT: cmpw $512, (%rdi) # imm = 0x200 ; X64-GENERIC-NEXT: setl %al ; X64-GENERIC-NEXT: retq ; @@ -761,46 +715,18 @@ define i1 @cmp16_load_slt_imm16_minsize(ptr %p0) minsize { } define i1 @cmp16_load_slt_imm16_optsize(ptr %p0) optsize { -; X86-GENERIC-LABEL: cmp16_load_slt_imm16_optsize: -; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: movswl (%eax), %eax -; X86-GENERIC-NEXT: cmpl $512, %eax # imm = 0x200 -; X86-GENERIC-NEXT: setl %al -; X86-GENERIC-NEXT: retl -; -; X64-GENERIC-LABEL: cmp16_load_slt_imm16_optsize: -; X64-GENERIC: # %bb.0: -; X64-GENERIC-NEXT: movswl (%rdi), %eax -; X64-GENERIC-NEXT: cmpl $512, %eax # imm = 0x200 -; X64-GENERIC-NEXT: setl %al -; X64-GENERIC-NEXT: retq -; -; X86-FAST-LABEL: cmp16_load_slt_imm16_optsize: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: cmpw $512, (%eax) # imm = 0x200 -; X86-FAST-NEXT: setl %al -; X86-FAST-NEXT: retl -; -; X64-FAST-LABEL: cmp16_load_slt_imm16_optsize: -; X64-FAST: # %bb.0: -; X64-FAST-NEXT: cmpw $512, (%rdi) # imm = 0x200 -; X64-FAST-NEXT: setl %al -; X64-FAST-NEXT: retq -; -; X86-ATOM-LABEL: cmp16_load_slt_imm16_optsize: -; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-ATOM-NEXT: cmpw $512, (%eax) # imm = 0x200 -; X86-ATOM-NEXT: setl %al -; X86-ATOM-NEXT: retl +; X86-LABEL: cmp16_load_slt_imm16_optsize: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw $512, (%eax) # imm = 0x200 +; X86-NEXT: setl %al +; X86-NEXT: retl ; -; X64-ATOM-LABEL: cmp16_load_slt_imm16_optsize: -; X64-ATOM: # %bb.0: -; X64-ATOM-NEXT: cmpw $512, (%rdi) # imm = 0x200 -; X64-ATOM-NEXT: setl %al -; X64-ATOM-NEXT: retq +; X64-LABEL: cmp16_load_slt_imm16_optsize: +; X64: # %bb.0: +; X64-NEXT: cmpw $512, (%rdi) # imm = 0x200 +; X64-NEXT: setl %al +; X64-NEXT: retq %ld = load i16, ptr %p0 %cmp = icmp slt i16 %ld, 512 ret i1 %cmp @@ -860,15 +786,13 @@ define i1 @cmp16_load_ule_imm16(ptr %p0) { ; X86-GENERIC-LABEL: cmp16_load_ule_imm16: ; X86-GENERIC: # %bb.0: ; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: movzwl (%eax), %eax -; X86-GENERIC-NEXT: cmpl $513, %eax # imm = 0x201 +; X86-GENERIC-NEXT: cmpw $513, (%eax) # imm = 0x201 ; X86-GENERIC-NEXT: setb %al ; X86-GENERIC-NEXT: retl ; ; X64-GENERIC-LABEL: cmp16_load_ule_imm16: ; X64-GENERIC: # %bb.0: -; X64-GENERIC-NEXT: movzwl (%rdi), %eax -; X64-GENERIC-NEXT: cmpl $513, %eax # imm = 0x201 +; X64-GENERIC-NEXT: cmpw $513, (%rdi) # imm = 0x201 ; X64-GENERIC-NEXT: setb %al ; X64-GENERIC-NEXT: retq ; @@ -927,46 +851,18 @@ define i1 @cmp16_load_ule_imm16_minsize(ptr %p0) minsize { } define i1 @cmp16_load_ule_imm16_optsize(ptr %p0) 
optsize { -; X86-GENERIC-LABEL: cmp16_load_ule_imm16_optsize: -; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: movzwl (%eax), %eax -; X86-GENERIC-NEXT: cmpl $513, %eax # imm = 0x201 -; X86-GENERIC-NEXT: setb %al -; X86-GENERIC-NEXT: retl -; -; X64-GENERIC-LABEL: cmp16_load_ule_imm16_optsize: -; X64-GENERIC: # %bb.0: -; X64-GENERIC-NEXT: movzwl (%rdi), %eax -; X64-GENERIC-NEXT: cmpl $513, %eax # imm = 0x201 -; X64-GENERIC-NEXT: setb %al -; X64-GENERIC-NEXT: retq -; -; X86-FAST-LABEL: cmp16_load_ule_imm16_optsize: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: cmpw $513, (%eax) # imm = 0x201 -; X86-FAST-NEXT: setb %al -; X86-FAST-NEXT: retl -; -; X64-FAST-LABEL: cmp16_load_ule_imm16_optsize: -; X64-FAST: # %bb.0: -; X64-FAST-NEXT: cmpw $513, (%rdi) # imm = 0x201 -; X64-FAST-NEXT: setb %al -; X64-FAST-NEXT: retq -; -; X86-ATOM-LABEL: cmp16_load_ule_imm16_optsize: -; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-ATOM-NEXT: cmpw $513, (%eax) # imm = 0x201 -; X86-ATOM-NEXT: setb %al -; X86-ATOM-NEXT: retl +; X86-LABEL: cmp16_load_ule_imm16_optsize: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw $513, (%eax) # imm = 0x201 +; X86-NEXT: setb %al +; X86-NEXT: retl ; -; X64-ATOM-LABEL: cmp16_load_ule_imm16_optsize: -; X64-ATOM: # %bb.0: -; X64-ATOM-NEXT: cmpw $513, (%rdi) # imm = 0x201 -; X64-ATOM-NEXT: setb %al -; X64-ATOM-NEXT: retq +; X64-LABEL: cmp16_load_ule_imm16_optsize: +; X64: # %bb.0: +; X64-NEXT: cmpw $513, (%rdi) # imm = 0x201 +; X64-NEXT: setb %al +; X64-NEXT: retq %ld = load i16, ptr %p0 %cmp = icmp ule i16 %ld, 512 ret i1 %cmp diff --git a/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll index bace90cbcccbd2..29751dcfca5d6b 100644 --- a/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll +++ b/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=i386-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=32-ALL,32-GOOD-RA -; RUN: llc -mtriple=i386-linux-gnu -verify-machineinstrs -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefixes=32-ALL,32-FAST-RA +; RUN: llc -mtriple=i386-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=X86-ALL,X86-GOOD-RA +; RUN: llc -mtriple=i386-linux-gnu -verify-machineinstrs -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefixes=X86-ALL,X86-FAST-RA -; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=64-ALL -; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefix=64-ALL -; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mattr=+sahf %s -o - | FileCheck %s --check-prefix=64-ALL -; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefix=64-ALL -; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mcpu=corei7 %s -o - | FileCheck %s --check-prefix=64-ALL +; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=X64-ALL +; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefix=X64-ALL +; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mattr=+sahf %s -o - | FileCheck %s --check-prefix=X64-ALL +; RUN: llc -mtriple=x86_64-linux-gnu 
-verify-machineinstrs -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefix=X64-ALL +; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mcpu=corei7 %s -o - | FileCheck %s --check-prefix=X64-ALL declare i32 @foo() declare i32 @bar(i64) @@ -23,89 +23,89 @@ declare i32 @bar(i64) ; we then promote these copies into independent conditions in GPRs that avoids ; repeated saving and restoring logic and can be trivially managed by the ; register allocator. -define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) nounwind { -; 32-GOOD-RA-LABEL: test_intervening_call: -; 32-GOOD-RA: # %bb.0: # %entry -; 32-GOOD-RA-NEXT: pushl %ebx -; 32-GOOD-RA-NEXT: pushl %esi -; 32-GOOD-RA-NEXT: pushl %eax -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %edx -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ebx -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %esi -; 32-GOOD-RA-NEXT: lock cmpxchg8b (%esi) -; 32-GOOD-RA-NEXT: setne %bl -; 32-GOOD-RA-NEXT: subl $8, %esp -; 32-GOOD-RA-NEXT: pushl %edx -; 32-GOOD-RA-NEXT: pushl %eax -; 32-GOOD-RA-NEXT: calll bar@PLT -; 32-GOOD-RA-NEXT: addl $16, %esp -; 32-GOOD-RA-NEXT: testb %bl, %bl -; 32-GOOD-RA-NEXT: jne .LBB0_3 -; 32-GOOD-RA-NEXT: # %bb.1: # %t -; 32-GOOD-RA-NEXT: movl $42, %eax -; 32-GOOD-RA-NEXT: jmp .LBB0_2 -; 32-GOOD-RA-NEXT: .LBB0_3: # %f -; 32-GOOD-RA-NEXT: xorl %eax, %eax -; 32-GOOD-RA-NEXT: .LBB0_2: # %t -; 32-GOOD-RA-NEXT: xorl %edx, %edx -; 32-GOOD-RA-NEXT: addl $4, %esp -; 32-GOOD-RA-NEXT: popl %esi -; 32-GOOD-RA-NEXT: popl %ebx -; 32-GOOD-RA-NEXT: retl +define i64 @test_intervening_call(ptr %foo, i64 %bar, i64 %baz) nounwind { +; X86-GOOD-RA-LABEL: test_intervening_call: +; X86-GOOD-RA: # %bb.0: # %entry +; X86-GOOD-RA-NEXT: pushl %ebx +; X86-GOOD-RA-NEXT: pushl %esi +; X86-GOOD-RA-NEXT: pushl %eax +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-GOOD-RA-NEXT: lock cmpxchg8b (%esi) +; X86-GOOD-RA-NEXT: setne %bl +; X86-GOOD-RA-NEXT: subl $8, %esp +; X86-GOOD-RA-NEXT: pushl %edx +; X86-GOOD-RA-NEXT: pushl %eax +; X86-GOOD-RA-NEXT: calll bar@PLT +; X86-GOOD-RA-NEXT: addl $16, %esp +; X86-GOOD-RA-NEXT: testb %bl, %bl +; X86-GOOD-RA-NEXT: jne .LBB0_3 +; X86-GOOD-RA-NEXT: # %bb.1: # %t +; X86-GOOD-RA-NEXT: movl $42, %eax +; X86-GOOD-RA-NEXT: jmp .LBB0_2 +; X86-GOOD-RA-NEXT: .LBB0_3: # %f +; X86-GOOD-RA-NEXT: xorl %eax, %eax +; X86-GOOD-RA-NEXT: .LBB0_2: # %t +; X86-GOOD-RA-NEXT: xorl %edx, %edx +; X86-GOOD-RA-NEXT: addl $4, %esp +; X86-GOOD-RA-NEXT: popl %esi +; X86-GOOD-RA-NEXT: popl %ebx +; X86-GOOD-RA-NEXT: retl ; -; 32-FAST-RA-LABEL: test_intervening_call: -; 32-FAST-RA: # %bb.0: # %entry -; 32-FAST-RA-NEXT: pushl %ebx -; 32-FAST-RA-NEXT: pushl %esi -; 32-FAST-RA-NEXT: pushl %eax -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %esi -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ebx -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %edx -; 32-FAST-RA-NEXT: lock cmpxchg8b (%esi) -; 32-FAST-RA-NEXT: setne %bl -; 32-FAST-RA-NEXT: subl $8, %esp -; 32-FAST-RA-NEXT: pushl %edx -; 32-FAST-RA-NEXT: pushl %eax -; 32-FAST-RA-NEXT: calll bar@PLT -; 32-FAST-RA-NEXT: addl $16, %esp -; 32-FAST-RA-NEXT: testb %bl, %bl -; 32-FAST-RA-NEXT: jne .LBB0_3 -; 32-FAST-RA-NEXT: # %bb.1: # 
%t -; 32-FAST-RA-NEXT: movl $42, %eax -; 32-FAST-RA-NEXT: jmp .LBB0_2 -; 32-FAST-RA-NEXT: .LBB0_3: # %f -; 32-FAST-RA-NEXT: xorl %eax, %eax -; 32-FAST-RA-NEXT: .LBB0_2: # %t -; 32-FAST-RA-NEXT: xorl %edx, %edx -; 32-FAST-RA-NEXT: addl $4, %esp -; 32-FAST-RA-NEXT: popl %esi -; 32-FAST-RA-NEXT: popl %ebx -; 32-FAST-RA-NEXT: retl +; X86-FAST-RA-LABEL: test_intervening_call: +; X86-FAST-RA: # %bb.0: # %entry +; X86-FAST-RA-NEXT: pushl %ebx +; X86-FAST-RA-NEXT: pushl %esi +; X86-FAST-RA-NEXT: pushl %eax +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-RA-NEXT: lock cmpxchg8b (%esi) +; X86-FAST-RA-NEXT: setne %bl +; X86-FAST-RA-NEXT: subl $8, %esp +; X86-FAST-RA-NEXT: pushl %edx +; X86-FAST-RA-NEXT: pushl %eax +; X86-FAST-RA-NEXT: calll bar@PLT +; X86-FAST-RA-NEXT: addl $16, %esp +; X86-FAST-RA-NEXT: testb %bl, %bl +; X86-FAST-RA-NEXT: jne .LBB0_3 +; X86-FAST-RA-NEXT: # %bb.1: # %t +; X86-FAST-RA-NEXT: movl $42, %eax +; X86-FAST-RA-NEXT: jmp .LBB0_2 +; X86-FAST-RA-NEXT: .LBB0_3: # %f +; X86-FAST-RA-NEXT: xorl %eax, %eax +; X86-FAST-RA-NEXT: .LBB0_2: # %t +; X86-FAST-RA-NEXT: xorl %edx, %edx +; X86-FAST-RA-NEXT: addl $4, %esp +; X86-FAST-RA-NEXT: popl %esi +; X86-FAST-RA-NEXT: popl %ebx +; X86-FAST-RA-NEXT: retl ; -; 64-ALL-LABEL: test_intervening_call: -; 64-ALL: # %bb.0: # %entry -; 64-ALL-NEXT: pushq %rbx -; 64-ALL-NEXT: movq %rsi, %rax -; 64-ALL-NEXT: lock cmpxchgq %rdx, (%rdi) -; 64-ALL-NEXT: setne %bl -; 64-ALL-NEXT: movq %rax, %rdi -; 64-ALL-NEXT: callq bar@PLT -; 64-ALL-NEXT: testb %bl, %bl -; 64-ALL-NEXT: jne .LBB0_2 -; 64-ALL-NEXT: # %bb.1: # %t -; 64-ALL-NEXT: movl $42, %eax -; 64-ALL-NEXT: popq %rbx -; 64-ALL-NEXT: retq -; 64-ALL-NEXT: .LBB0_2: # %f -; 64-ALL-NEXT: xorl %eax, %eax -; 64-ALL-NEXT: popq %rbx -; 64-ALL-NEXT: retq +; X64-ALL-LABEL: test_intervening_call: +; X64-ALL: # %bb.0: # %entry +; X64-ALL-NEXT: pushq %rbx +; X64-ALL-NEXT: movq %rsi, %rax +; X64-ALL-NEXT: lock cmpxchgq %rdx, (%rdi) +; X64-ALL-NEXT: setne %bl +; X64-ALL-NEXT: movq %rax, %rdi +; X64-ALL-NEXT: callq bar@PLT +; X64-ALL-NEXT: testb %bl, %bl +; X64-ALL-NEXT: jne .LBB0_2 +; X64-ALL-NEXT: # %bb.1: # %t +; X64-ALL-NEXT: movl $42, %eax +; X64-ALL-NEXT: popq %rbx +; X64-ALL-NEXT: retq +; X64-ALL-NEXT: .LBB0_2: # %f +; X64-ALL-NEXT: xorl %eax, %eax +; X64-ALL-NEXT: popq %rbx +; X64-ALL-NEXT: retq entry: - %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst + %cx = cmpxchg ptr %foo, i64 %bar, i64 %baz seq_cst seq_cst %v = extractvalue { i64, i1 } %cx, 0 %p = extractvalue { i64, i1 } %cx, 1 call i32 @bar(i64 %v) @@ -119,62 +119,62 @@ f: } ; Interesting in producing a clobber without any function calls. 
-define i32 @test_control_flow(i32* %p, i32 %i, i32 %j) nounwind { -; 32-ALL-LABEL: test_control_flow: -; 32-ALL: # %bb.0: # %entry -; 32-ALL-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32-ALL-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; 32-ALL-NEXT: jle .LBB1_6 -; 32-ALL-NEXT: # %bb.1: # %loop_start -; 32-ALL-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 32-ALL-NEXT: .p2align 4, 0x90 -; 32-ALL-NEXT: .LBB1_2: # %while.condthread-pre-split.i -; 32-ALL-NEXT: # =>This Loop Header: Depth=1 -; 32-ALL-NEXT: # Child Loop BB1_3 Depth 2 -; 32-ALL-NEXT: movl (%ecx), %edx -; 32-ALL-NEXT: .p2align 4, 0x90 -; 32-ALL-NEXT: .LBB1_3: # %while.cond.i -; 32-ALL-NEXT: # Parent Loop BB1_2 Depth=1 -; 32-ALL-NEXT: # => This Inner Loop Header: Depth=2 -; 32-ALL-NEXT: movl %edx, %eax -; 32-ALL-NEXT: xorl %edx, %edx -; 32-ALL-NEXT: testl %eax, %eax -; 32-ALL-NEXT: je .LBB1_3 -; 32-ALL-NEXT: # %bb.4: # %while.body.i -; 32-ALL-NEXT: # in Loop: Header=BB1_2 Depth=1 -; 32-ALL-NEXT: lock cmpxchgl %eax, (%ecx) -; 32-ALL-NEXT: jne .LBB1_2 -; 32-ALL-NEXT: # %bb.5: -; 32-ALL-NEXT: xorl %eax, %eax -; 32-ALL-NEXT: .LBB1_6: # %cond.end -; 32-ALL-NEXT: retl +define i32 @test_control_flow(ptr %p, i32 %i, i32 %j) nounwind { +; X86-ALL-LABEL: test_control_flow: +; X86-ALL: # %bb.0: # %entry +; X86-ALL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-ALL-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-ALL-NEXT: jle .LBB1_6 +; X86-ALL-NEXT: # %bb.1: # %loop_start +; X86-ALL-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-ALL-NEXT: .p2align 4, 0x90 +; X86-ALL-NEXT: .LBB1_2: # %while.condthread-pre-split.i +; X86-ALL-NEXT: # =>This Loop Header: Depth=1 +; X86-ALL-NEXT: # Child Loop BB1_3 Depth 2 +; X86-ALL-NEXT: movl (%ecx), %edx +; X86-ALL-NEXT: .p2align 4, 0x90 +; X86-ALL-NEXT: .LBB1_3: # %while.cond.i +; X86-ALL-NEXT: # Parent Loop BB1_2 Depth=1 +; X86-ALL-NEXT: # => This Inner Loop Header: Depth=2 +; X86-ALL-NEXT: movl %edx, %eax +; X86-ALL-NEXT: xorl %edx, %edx +; X86-ALL-NEXT: testl %eax, %eax +; X86-ALL-NEXT: je .LBB1_3 +; X86-ALL-NEXT: # %bb.4: # %while.body.i +; X86-ALL-NEXT: # in Loop: Header=BB1_2 Depth=1 +; X86-ALL-NEXT: lock cmpxchgl %eax, (%ecx) +; X86-ALL-NEXT: jne .LBB1_2 +; X86-ALL-NEXT: # %bb.5: +; X86-ALL-NEXT: xorl %eax, %eax +; X86-ALL-NEXT: .LBB1_6: # %cond.end +; X86-ALL-NEXT: retl ; -; 64-ALL-LABEL: test_control_flow: -; 64-ALL: # %bb.0: # %entry -; 64-ALL-NEXT: movl %esi, %eax -; 64-ALL-NEXT: cmpl %edx, %esi -; 64-ALL-NEXT: jle .LBB1_5 -; 64-ALL-NEXT: .p2align 4, 0x90 -; 64-ALL-NEXT: .LBB1_1: # %while.condthread-pre-split.i -; 64-ALL-NEXT: # =>This Loop Header: Depth=1 -; 64-ALL-NEXT: # Child Loop BB1_2 Depth 2 -; 64-ALL-NEXT: movl (%rdi), %ecx -; 64-ALL-NEXT: .p2align 4, 0x90 -; 64-ALL-NEXT: .LBB1_2: # %while.cond.i -; 64-ALL-NEXT: # Parent Loop BB1_1 Depth=1 -; 64-ALL-NEXT: # => This Inner Loop Header: Depth=2 -; 64-ALL-NEXT: movl %ecx, %eax -; 64-ALL-NEXT: xorl %ecx, %ecx -; 64-ALL-NEXT: testl %eax, %eax -; 64-ALL-NEXT: je .LBB1_2 -; 64-ALL-NEXT: # %bb.3: # %while.body.i -; 64-ALL-NEXT: # in Loop: Header=BB1_1 Depth=1 -; 64-ALL-NEXT: lock cmpxchgl %eax, (%rdi) -; 64-ALL-NEXT: jne .LBB1_1 -; 64-ALL-NEXT: # %bb.4: -; 64-ALL-NEXT: xorl %eax, %eax -; 64-ALL-NEXT: .LBB1_5: # %cond.end -; 64-ALL-NEXT: retq +; X64-ALL-LABEL: test_control_flow: +; X64-ALL: # %bb.0: # %entry +; X64-ALL-NEXT: movl %esi, %eax +; X64-ALL-NEXT: cmpl %edx, %esi +; X64-ALL-NEXT: jle .LBB1_5 +; X64-ALL-NEXT: .p2align 4, 0x90 +; X64-ALL-NEXT: .LBB1_1: # %while.condthread-pre-split.i +; X64-ALL-NEXT: # =>This Loop Header: Depth=1 +; X64-ALL-NEXT: # Child Loop BB1_2 Depth 2 +; 
X64-ALL-NEXT: movl (%rdi), %ecx +; X64-ALL-NEXT: .p2align 4, 0x90 +; X64-ALL-NEXT: .LBB1_2: # %while.cond.i +; X64-ALL-NEXT: # Parent Loop BB1_1 Depth=1 +; X64-ALL-NEXT: # => This Inner Loop Header: Depth=2 +; X64-ALL-NEXT: movl %ecx, %eax +; X64-ALL-NEXT: xorl %ecx, %ecx +; X64-ALL-NEXT: testl %eax, %eax +; X64-ALL-NEXT: je .LBB1_2 +; X64-ALL-NEXT: # %bb.3: # %while.body.i +; X64-ALL-NEXT: # in Loop: Header=BB1_1 Depth=1 +; X64-ALL-NEXT: lock cmpxchgl %eax, (%rdi) +; X64-ALL-NEXT: jne .LBB1_1 +; X64-ALL-NEXT: # %bb.4: +; X64-ALL-NEXT: xorl %eax, %eax +; X64-ALL-NEXT: .LBB1_5: # %cond.end +; X64-ALL-NEXT: retq entry: %cmp = icmp sgt i32 %i, %j br i1 %cmp, label %loop_start, label %cond.end @@ -183,7 +183,7 @@ loop_start: br label %while.condthread-pre-split.i while.condthread-pre-split.i: - %.pr.i = load i32, i32* %p, align 4 + %.pr.i = load i32, ptr %p, align 4 br label %while.cond.i while.cond.i: @@ -193,7 +193,7 @@ while.cond.i: while.body.i: %.lcssa = phi i32 [ %0, %while.cond.i ] - %1 = cmpxchg i32* %p, i32 %.lcssa, i32 %.lcssa seq_cst seq_cst + %1 = cmpxchg ptr %p, i32 %.lcssa, i32 %.lcssa seq_cst seq_cst %2 = extractvalue { i32, i1 } %1, 1 br i1 %2, label %cond.end.loopexit, label %while.condthread-pre-split.i @@ -207,69 +207,69 @@ cond.end: ; This one is an interesting case because CMOV doesn't have a chain ; operand. Naive attempts to limit cmpxchg EFLAGS use are likely to fail here. -define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) nounwind { -; 32-GOOD-RA-LABEL: test_feed_cmov: -; 32-GOOD-RA: # %bb.0: # %entry -; 32-GOOD-RA-NEXT: pushl %ebx -; 32-GOOD-RA-NEXT: pushl %esi -; 32-GOOD-RA-NEXT: pushl %eax -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %esi -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 32-GOOD-RA-NEXT: lock cmpxchgl %esi, (%ecx) -; 32-GOOD-RA-NEXT: sete %bl -; 32-GOOD-RA-NEXT: calll foo@PLT -; 32-GOOD-RA-NEXT: testb %bl, %bl -; 32-GOOD-RA-NEXT: jne .LBB2_2 -; 32-GOOD-RA-NEXT: # %bb.1: # %entry -; 32-GOOD-RA-NEXT: movl %eax, %esi -; 32-GOOD-RA-NEXT: .LBB2_2: # %entry -; 32-GOOD-RA-NEXT: movl %esi, %eax -; 32-GOOD-RA-NEXT: addl $4, %esp -; 32-GOOD-RA-NEXT: popl %esi -; 32-GOOD-RA-NEXT: popl %ebx -; 32-GOOD-RA-NEXT: retl +define i32 @test_feed_cmov(ptr %addr, i32 %desired, i32 %new) nounwind { +; X86-GOOD-RA-LABEL: test_feed_cmov: +; X86-GOOD-RA: # %bb.0: # %entry +; X86-GOOD-RA-NEXT: pushl %ebx +; X86-GOOD-RA-NEXT: pushl %esi +; X86-GOOD-RA-NEXT: pushl %eax +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-GOOD-RA-NEXT: lock cmpxchgl %esi, (%ecx) +; X86-GOOD-RA-NEXT: sete %bl +; X86-GOOD-RA-NEXT: calll foo@PLT +; X86-GOOD-RA-NEXT: testb %bl, %bl +; X86-GOOD-RA-NEXT: jne .LBB2_2 +; X86-GOOD-RA-NEXT: # %bb.1: # %entry +; X86-GOOD-RA-NEXT: movl %eax, %esi +; X86-GOOD-RA-NEXT: .LBB2_2: # %entry +; X86-GOOD-RA-NEXT: movl %esi, %eax +; X86-GOOD-RA-NEXT: addl $4, %esp +; X86-GOOD-RA-NEXT: popl %esi +; X86-GOOD-RA-NEXT: popl %ebx +; X86-GOOD-RA-NEXT: retl ; -; 32-FAST-RA-LABEL: test_feed_cmov: -; 32-FAST-RA: # %bb.0: # %entry -; 32-FAST-RA-NEXT: pushl %ebx -; 32-FAST-RA-NEXT: pushl %esi -; 32-FAST-RA-NEXT: pushl %eax -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %esi -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32-FAST-RA-NEXT: lock cmpxchgl %esi, (%ecx) -; 32-FAST-RA-NEXT: sete %bl -; 32-FAST-RA-NEXT: calll foo@PLT -; 32-FAST-RA-NEXT: testb %bl, %bl -; 
32-FAST-RA-NEXT: jne .LBB2_2 -; 32-FAST-RA-NEXT: # %bb.1: # %entry -; 32-FAST-RA-NEXT: movl %eax, %esi -; 32-FAST-RA-NEXT: .LBB2_2: # %entry -; 32-FAST-RA-NEXT: movl %esi, %eax -; 32-FAST-RA-NEXT: addl $4, %esp -; 32-FAST-RA-NEXT: popl %esi -; 32-FAST-RA-NEXT: popl %ebx -; 32-FAST-RA-NEXT: retl +; X86-FAST-RA-LABEL: test_feed_cmov: +; X86-FAST-RA: # %bb.0: # %entry +; X86-FAST-RA-NEXT: pushl %ebx +; X86-FAST-RA-NEXT: pushl %esi +; X86-FAST-RA-NEXT: pushl %eax +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-RA-NEXT: lock cmpxchgl %esi, (%ecx) +; X86-FAST-RA-NEXT: sete %bl +; X86-FAST-RA-NEXT: calll foo@PLT +; X86-FAST-RA-NEXT: testb %bl, %bl +; X86-FAST-RA-NEXT: jne .LBB2_2 +; X86-FAST-RA-NEXT: # %bb.1: # %entry +; X86-FAST-RA-NEXT: movl %eax, %esi +; X86-FAST-RA-NEXT: .LBB2_2: # %entry +; X86-FAST-RA-NEXT: movl %esi, %eax +; X86-FAST-RA-NEXT: addl $4, %esp +; X86-FAST-RA-NEXT: popl %esi +; X86-FAST-RA-NEXT: popl %ebx +; X86-FAST-RA-NEXT: retl ; -; 64-ALL-LABEL: test_feed_cmov: -; 64-ALL: # %bb.0: # %entry -; 64-ALL-NEXT: pushq %rbp -; 64-ALL-NEXT: pushq %rbx -; 64-ALL-NEXT: pushq %rax -; 64-ALL-NEXT: movl %edx, %ebx -; 64-ALL-NEXT: movl %esi, %eax -; 64-ALL-NEXT: lock cmpxchgl %edx, (%rdi) -; 64-ALL-NEXT: sete %bpl -; 64-ALL-NEXT: callq foo@PLT -; 64-ALL-NEXT: testb %bpl, %bpl -; 64-ALL-NEXT: cmovnel %ebx, %eax -; 64-ALL-NEXT: addq $8, %rsp -; 64-ALL-NEXT: popq %rbx -; 64-ALL-NEXT: popq %rbp -; 64-ALL-NEXT: retq +; X64-ALL-LABEL: test_feed_cmov: +; X64-ALL: # %bb.0: # %entry +; X64-ALL-NEXT: pushq %rbp +; X64-ALL-NEXT: pushq %rbx +; X64-ALL-NEXT: pushq %rax +; X64-ALL-NEXT: movl %edx, %ebx +; X64-ALL-NEXT: movl %esi, %eax +; X64-ALL-NEXT: lock cmpxchgl %edx, (%rdi) +; X64-ALL-NEXT: sete %bpl +; X64-ALL-NEXT: callq foo@PLT +; X64-ALL-NEXT: testb %bpl, %bpl +; X64-ALL-NEXT: cmovnel %ebx, %eax +; X64-ALL-NEXT: addq $8, %rsp +; X64-ALL-NEXT: popq %rbx +; X64-ALL-NEXT: popq %rbp +; X64-ALL-NEXT: retq entry: - %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst + %res = cmpxchg ptr %addr, i32 %desired, i32 %new seq_cst seq_cst %success = extractvalue { i32, i1 } %res, 1 %rhs = call i32 @foo() diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll index f3fbd85dc1ae6e..820e629ab056fb 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll @@ -116,8 +116,7 @@ define i1 @length2_eq_const(i8* %X) nounwind { ; X86-LABEL: length2_eq_const: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 ; X86-NEXT: setne %al ; X86-NEXT: retl %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i32 2) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll index 6d526dcfa5461b..4e0fb314eaa26c 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -113,8 +113,7 @@ define i1 @length2_gt(i8* %X, i8* %Y) nounwind { define i1 @length2_eq_const(i8* %X) nounwind { ; X64-LABEL: length2_eq_const: ; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: cmpw $12849, (%rdi) # imm = 0x3231 ; X64-NEXT: setne %al ; 
X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i64 2) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll index 4e72f9e7430b19..8939c8037483b6 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll @@ -45,8 +45,7 @@ define i1 @length2_eq_const(i8* %X) nounwind optsize { ; X86-LABEL: length2_eq_const: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 ; X86-NEXT: setne %al ; X86-NEXT: retl %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i32 2) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll index 6465c41c38c87f..29a5c9e26421fb 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll @@ -41,8 +41,7 @@ define i1 @length2_eq(i8* %X, i8* %Y) nounwind optsize { define i1 @length2_eq_const(i8* %X) nounwind optsize { ; X64-LABEL: length2_eq_const: ; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: cmpw $12849, (%rdi) # imm = 0x3231 ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll index 09befee23d4207..179a612610e3fc 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll @@ -45,8 +45,7 @@ define i1 @length2_eq_const(i8* %X) nounwind !prof !14 { ; X86-LABEL: length2_eq_const: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 ; X86-NEXT: setne %al ; X86-NEXT: retl %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i32 2) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll index 25494c181d2094..101a38cbdd3d01 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll @@ -41,8 +41,7 @@ define i1 @length2_eq(i8* %X, i8* %Y) nounwind !prof !14 { define i1 @length2_eq_const(i8* %X) nounwind !prof !14 { ; X64-LABEL: length2_eq_const: ; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: cmpw $12849, (%rdi) # imm = 0x3231 ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll index 8f4d77a2748434..082f8e928f8e35 100644 --- a/llvm/test/CodeGen/X86/memcmp-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-x32.ll @@ -144,8 +144,7 @@ define i1 @length2_eq_const(i8* %X) nounwind { ; X86-LABEL: length2_eq_const: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 ; X86-NEXT: setne %al ; X86-NEXT: retl %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), 
i32 2) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index f631819e4a297f..3f726d3f139c95 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -139,8 +139,7 @@ define i1 @length2_gt(i8* %X, i8* %Y) nounwind { define i1 @length2_eq_const(i8* %X) nounwind { ; X64-LABEL: length2_eq_const: ; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: cmpw $12849, (%rdi) # imm = 0x3231 ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i64 2) nounwind diff --git a/llvm/test/CodeGen/X86/patchable-function-entry-ibt.ll b/llvm/test/CodeGen/X86/patchable-function-entry-ibt.ll index 186b0890ce70c1..efbcb273e62932 100644 --- a/llvm/test/CodeGen/X86/patchable-function-entry-ibt.ll +++ b/llvm/test/CodeGen/X86/patchable-function-entry-ibt.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=i686 %s -o - | FileCheck --check-prefixes=CHECK,32 %s -; RUN: llc -mtriple=x86_64 %s -o - | FileCheck --check-prefixes=CHECK,64 %s +; RUN: llc -mtriple=i686 %s -o - | FileCheck --check-prefixes=CHECK,X86 %s +; RUN: llc -mtriple=x86_64 %s -o - | FileCheck --check-prefixes=CHECK,X64 %s ;; -fpatchable-function-entry=0 -fcf-protection=branch define void @f0() "patchable-function-entry"="0" { @@ -7,8 +7,8 @@ define void @f0() "patchable-function-entry"="0" { ; CHECK-NEXT: .Lfunc_begin0: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: # %bb.0: -; 32-NEXT: endbr32 -; 64-NEXT: endbr64 +; X86-NEXT: endbr32 +; X64-NEXT: endbr64 ; CHECK-NEXT: ret ; CHECK-NOT: .section __patchable_function_entries ret void @@ -22,16 +22,16 @@ define void @f1() "patchable-function-entry"="1" { ; CHECK-NEXT: .Lfunc_begin1: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: # %bb.0: -; 32-NEXT: endbr32 -; 64-NEXT: endbr64 +; X86-NEXT: endbr32 +; X64-NEXT: endbr64 ; CHECK-NEXT: .Lpatch0: ; CHECK-NEXT: nop ; CHECK-NEXT: ret ; CHECK: .section __patchable_function_entries,"awo",@progbits,f1{{$}} -; 32-NEXT: .p2align 2 -; 32-NEXT: .long .Lpatch0 -; 64-NEXT: .p2align 3 -; 64-NEXT: .quad .Lpatch0 +; X86-NEXT: .p2align 2 +; X86-NEXT: .long .Lpatch0 +; X64-NEXT: .p2align 3 +; X64-NEXT: .quad .Lpatch0 ret void } @@ -44,17 +44,17 @@ define void @f2_1() "patchable-function-entry"="1" "patchable-function-prefix"=" ; CHECK-NEXT: .Lfunc_begin2: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: # %bb.0: -; 32-NEXT: endbr32 -; 64-NEXT: endbr64 +; X86-NEXT: endbr32 +; X64-NEXT: endbr64 ; CHECK-NEXT: nop ; CHECK-NEXT: ret ; CHECK: .Lfunc_end2: ; CHECK-NEXT: .size f2_1, .Lfunc_end2-f2_1 ; CHECK: .section __patchable_function_entries,"awo",@progbits,f2_1{{$}} -; 32-NEXT: .p2align 2 -; 32-NEXT: .long .Ltmp0 -; 64-NEXT: .p2align 3 -; 64-NEXT: .quad .Ltmp0 +; X86-NEXT: .p2align 2 +; X86-NEXT: .long .Ltmp0 +; X64-NEXT: .p2align 3 +; X64-NEXT: .quad .Ltmp0 ret void } @@ -74,10 +74,10 @@ define internal void @f1i() "patchable-function-entry"="1" { ;; Another basic block has ENDBR, but it doesn't affect our decision to not create .Lpatch0 ; CHECK: endbr ; CHECK: .section __patchable_function_entries,"awo",@progbits,f1i{{$}} -; 32-NEXT: .p2align 2 -; 32-NEXT: .long .Lfunc_begin3 -; 64-NEXT: .p2align 3 -; 64-NEXT: .quad .Lfunc_begin3 +; X86-NEXT: .p2align 2 +; X86-NEXT: .long .Lfunc_begin3 +; X64-NEXT: .p2align 3 +; X64-NEXT: .quad .Lfunc_begin3 entry: tail call i32 @llvm.eh.sjlj.setjmp(i8* bitcast ([5 x i8*]* @buf to i8*)) ret void @@ -93,8 +93,8 @@ entry: ; CHECK-NEXT: .Lfunc_begin{{.*}}: ; 
CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: # %bb.0: -; 32-NEXT: endbr32 -; 64-NEXT: endbr64 +; X86-NEXT: endbr32 +; X64-NEXT: endbr64 ; CHECK-NEXT: nop ; CHECK-NEXT: ret define void @sanitize_function(ptr noundef %x) "patchable-function-prefix"="1" "patchable-function-entry"="1" !func_sanitize !1 { diff --git a/llvm/test/CodeGen/X86/patchable-function-entry.ll b/llvm/test/CodeGen/X86/patchable-function-entry.ll index 8c37f545108015..54ecd8b1e5dafb 100644 --- a/llvm/test/CodeGen/X86/patchable-function-entry.ll +++ b/llvm/test/CodeGen/X86/patchable-function-entry.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=i386 %s -o - | FileCheck --check-prefixes=CHECK,32 %s -; RUN: llc -mtriple=x86_64 %s -o - | FileCheck --check-prefixes=CHECK,64 %s -; RUN: llc -mtriple=x86_64 -function-sections %s -o - | FileCheck --check-prefixes=CHECK,64 %s +; RUN: llc -mtriple=i386 %s -o - | FileCheck --check-prefixes=CHECK,X86 %s +; RUN: llc -mtriple=x86_64 %s -o - | FileCheck --check-prefixes=CHECK,X64 %s +; RUN: llc -mtriple=x86_64 -function-sections %s -o - | FileCheck --check-prefixes=CHECK,X64 %s define void @f0() "patchable-function-entry"="0" { ; CHECK-LABEL: f0: @@ -17,10 +17,10 @@ define void @f1() "patchable-function-entry"="1" { ; CHECK: nop ; CHECK-NEXT: ret ; CHECK: .section __patchable_function_entries,"awo",@progbits,f1{{$}} -; 32: .p2align 2 -; 32-NEXT: .long .Lfunc_begin1 -; 64: .p2align 3 -; 64-NEXT: .quad .Lfunc_begin1 +; X86: .p2align 2 +; X86-NEXT: .long .Lfunc_begin1 +; X64: .p2align 3 +; X64-NEXT: .quad .Lfunc_begin1 ret void } @@ -31,14 +31,14 @@ define void @f1() "patchable-function-entry"="1" { define void @f2() "patchable-function-entry"="2" { ; CHECK-LABEL: f2: ; CHECK-NEXT: .Lfunc_begin2: -; 32: xchgw %ax, %ax -; 64: xchgw %ax, %ax +; X86: xchgw %ax, %ax +; X64: xchgw %ax, %ax ; CHECK-NEXT: ret ; CHECK: .section __patchable_function_entries,"awo",@progbits,f2{{$}} -; 32: .p2align 2 -; 32-NEXT: .long .Lfunc_begin2 -; 64: .p2align 3 -; 64-NEXT: .quad .Lfunc_begin2 +; X86: .p2align 2 +; X86-NEXT: .long .Lfunc_begin2 +; X64: .p2align 3 +; X64-NEXT: .quad .Lfunc_begin2 ret void } @@ -46,15 +46,15 @@ $f3 = comdat any define void @f3() "patchable-function-entry"="3" comdat { ; CHECK-LABEL: f3: ; CHECK-NEXT: .Lfunc_begin3: -; 32: xchgw %ax, %ax -; 32-NEXT: nop -; 64: nopl (%rax) +; X86: xchgw %ax, %ax +; X86-NEXT: nop +; X64: nopl (%rax) ; CHECK: ret ; CHECK: .section __patchable_function_entries,"awoG",@progbits,f3,f3,comdat{{$}} -; 32: .p2align 2 -; 32-NEXT: .long .Lfunc_begin3 -; 64: .p2align 3 -; 64-NEXT: .quad .Lfunc_begin3 +; X86: .p2align 2 +; X86-NEXT: .long .Lfunc_begin3 +; X64: .p2align 3 +; X64-NEXT: .quad .Lfunc_begin3 ret void } @@ -62,15 +62,15 @@ $f5 = comdat any define void @f5() "patchable-function-entry"="5" comdat { ; CHECK-LABEL: f5: ; CHECK-NEXT: .Lfunc_begin4: -; 32-COUNT-2: xchgw %ax, %ax -; 32-NEXT: nop -; 64: nopl 8(%rax,%rax) +; X86-COUNT-2: xchgw %ax, %ax +; X86-NEXT: nop +; X64: nopl 8(%rax,%rax) ; CHECK-NEXT: ret ; CHECK: .section __patchable_function_entries,"awoG",@progbits,f5,f5,comdat{{$}} -; 32: .p2align 2 -; 32-NEXT: .long .Lfunc_begin4 -; 64: .p2align 3 -; 64-NEXT: .quad .Lfunc_begin4 +; X86: .p2align 2 +; X86-NEXT: .long .Lfunc_begin4 +; X64: .p2align 3 +; X64-NEXT: .quad .Lfunc_begin4 ret void } @@ -91,10 +91,10 @@ define void @f3_2() "patchable-function-entry"="1" "patchable-function-prefix"=" ; CHECK: .Lfunc_end5: ; CHECK-NEXT: .size f3_2, .Lfunc_end5-f3_2 ; CHECK: .section __patchable_function_entries,"awo",@progbits,f3_2{{$}} -; 32: .p2align 2 -; 32-NEXT: .long 
.Ltmp0 -; 64: .p2align 3 -; 64-NEXT: .quad .Ltmp0 +; X86: .p2align 2 +; X86-NEXT: .long .Ltmp0 +; X64: .p2align 3 +; X64-NEXT: .quad .Ltmp0 %frame = alloca i8, i32 16 ret void } diff --git a/llvm/test/CodeGen/X86/patchable-prologue.ll b/llvm/test/CodeGen/X86/patchable-prologue.ll index 43761e3d1e1eb9..aec76a359d267e 100644 --- a/llvm/test/CodeGen/X86/patchable-prologue.ll +++ b/llvm/test/CodeGen/X86/patchable-prologue.ll @@ -1,10 +1,10 @@ ; RUN: llc -verify-machineinstrs -filetype=obj -o - -mtriple=x86_64-apple-macosx < %s | llvm-objdump --no-print-imm-hex --triple=x86_64-apple-macosx -d - | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=x86_64-apple-macosx < %s | FileCheck %s --check-prefix=CHECK-ALIGN -; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386 < %s | FileCheck %s --check-prefixes=32,32CFI,XCHG -; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc < %s | FileCheck %s --check-prefixes=32,MOV -; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc -mcpu=pentium3 < %s | FileCheck %s --check-prefixes=32,MOV -; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc -mcpu=pentium4 < %s | FileCheck %s --check-prefixes=32,XCHG -; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=64 +; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386 < %s | FileCheck %s --check-prefixes=X86,X86CFI,XCHG +; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc < %s | FileCheck %s --check-prefixes=X86,MOV +; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc -mcpu=pentium3 < %s | FileCheck %s --check-prefixes=X86,MOV +; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc -mcpu=pentium4 < %s | FileCheck %s --check-prefixes=X86,XCHG +; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64 declare void @callee(ptr) @@ -15,18 +15,18 @@ define void @f0() "patchable-function"="prologue-short-redirect" { ; CHECK-ALIGN: .p2align 4, 0x90 ; CHECK-ALIGN: _f0: -; 32: f0: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: f0: +; X86CFI-NEXT: .cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] ; MOV-NEXT: movl %edi, %edi # encoding: [0x8b,0xff] -; 32-NEXT: retl +; X86-NEXT: retl + +; X64: f0: +; X64-NEXT: # %bb.0: +; X64-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] +; X64-NEXT: retq -; 64: f0: -; 64-NEXT: # %bb.0: -; 64-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] -; 64-NEXT: retq - ret void } @@ -38,19 +38,19 @@ define void @f1() "patchable-function"="prologue-short-redirect" "frame-pointer" ; CHECK-ALIGN: .p2align 4, 0x90 ; CHECK-ALIGN: _f1: -; 32: f1: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: f1: +; X86CFI-NEXT: .cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] ; MOV-NEXT: movl %edi, %edi # encoding: [0x8b,0xff] -; 32-NEXT: pushl %ebp - -; 64: f1: -; 64-NEXT: .seh_proc f1 -; 64-NEXT: # %bb.0: -; 64-NEXT: xchgw %ax, %ax -; 64-NEXT: pushq %rbp - +; X86-NEXT: pushl %ebp + +; X64: f1: +; X64-NEXT: .seh_proc f1 +; X64-NEXT: # %bb.0: +; X64-NEXT: xchgw %ax, %ax +; X64-NEXT: pushq %rbp + ret void } @@ -61,18 +61,18 @@ define void @f2() "patchable-function"="prologue-short-redirect" { ; CHECK-ALIGN: .p2align 4, 0x90 ; CHECK-ALIGN: _f2: -; 32: f2: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: f2: +; X86CFI-NEXT: 
.cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] ; MOV-NEXT: movl %edi, %edi # encoding: [0x8b,0xff] -; 32-NEXT: pushl %ebp +; X86-NEXT: pushl %ebp + +; X64: f2: +; X64-NEXT: .seh_proc f2 +; X64-NEXT: # %bb.0: +; X64-NEXT: subq $200, %rsp -; 64: f2: -; 64-NEXT: .seh_proc f2 -; 64-NEXT: # %bb.0: -; 64-NEXT: subq $200, %rsp - %ptr = alloca i64, i32 20 call void @callee(ptr %ptr) ret void @@ -85,17 +85,17 @@ define void @f3() "patchable-function"="prologue-short-redirect" optsize { ; CHECK-ALIGN: .p2align 4, 0x90 ; CHECK-ALIGN: _f3: -; 32: f3: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: f3: +; X86CFI-NEXT: .cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax ; MOV-NEXT: movl %edi, %edi -; 32-NEXT: retl +; X86-NEXT: retl -; 64: f3: -; 64-NEXT: # %bb.0: -; 64-NEXT: xchgw %ax, %ax -; 64-NEXT: retq +; X64: f3: +; X64-NEXT: # %bb.0: +; X64-NEXT: xchgw %ax, %ax +; X64-NEXT: retq ret void } @@ -105,16 +105,16 @@ define void @f3() "patchable-function"="prologue-short-redirect" optsize { ; patchable one. ; CHECK-LABEL: f4{{>?}}: ; CHECK-NEXT: 8b 0c 37 movl (%rdi,%rsi), %ecx -; 32: f4: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: f4: +; X86CFI-NEXT: .cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax ; MOV-NEXT: movl %edi, %edi -; 32-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx -; 64: f4: -; 64-NEXT: # %bb.0: -; 64-NOT: xchgw %ax, %ax +; X64: f4: +; X64-NEXT: # %bb.0: +; X64-NOT: xchgw %ax, %ax define i32 @f4(ptr %arg1, i64 %arg2, i32 %arg3) "patchable-function"="prologue-short-redirect" { bb: @@ -143,15 +143,15 @@ bb21: ; CHECK-ALIGN: .p2align 4, 0x90 ; CHECK-ALIGN: _emptyfunc: -; 32: emptyfunc: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: emptyfunc: +; X86CFI-NEXT: .cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax ; MOV-NEXT: movl %edi, %edi -; 64: emptyfunc: -; 64-NEXT: # %bb.0: -; 64-NEXT: xchgw %ax, %ax +; X64: emptyfunc: +; X64-NEXT: # %bb.0: +; X64-NEXT: xchgw %ax, %ax ; From code: int emptyfunc() {} define i32 @emptyfunc() "patchable-function"="prologue-short-redirect" { @@ -169,15 +169,15 @@ define i32 @emptyfunc() "patchable-function"="prologue-short-redirect" { ; CHECK-ALIGN: .p2align 4, 0x90 ; CHECK-ALIGN: _jmp_to_start: -; 32: jmp_to_start: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: jmp_to_start: +; X86CFI-NEXT: .cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax ; MOV-NEXT: movl %edi, %edi -; 64: jmp_to_start: -; 64-NEXT: # %bb.0: -; 64-NEXT: xchgw %ax, %ax +; X64: jmp_to_start: +; X64-NEXT: # %bb.0: +; X64-NEXT: xchgw %ax, %ax define dso_local void @jmp_to_start(ptr inreg nocapture noundef %b) "patchable-function"="prologue-short-redirect" { entry: @@ -198,12 +198,12 @@ do.end: ; preds = %do.body ; Test that inline asm is properly hotpatched. We currently don't examine the ; asm instruction when printing it, thus we always emit patching NOPs. 
-; 64: inline_asm: -; 64-NEXT: # %bb.0: -; 64-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] -; 64-NEXT: #APP -; 64-NEXT: int3 # encoding: [0xcc] -; 64-NEXT: #NO_APP +; X64: inline_asm: +; X64-NEXT: # %bb.0: +; X64-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] +; X64-NEXT: #APP +; X64-NEXT: int3 # encoding: [0xcc] +; X64-NEXT: #NO_APP define dso_local void @inline_asm() "patchable-function"="prologue-short-redirect" { entry: diff --git a/llvm/test/CodeGen/X86/pr32345.ll b/llvm/test/CodeGen/X86/pr32345.ll index 43c183e80999ef..6dcc03e5568060 100644 --- a/llvm/test/CodeGen/X86/pr32345.ll +++ b/llvm/test/CodeGen/X86/pr32345.ll @@ -1,74 +1,74 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s -check-prefix=X640 -; RUN: llc -O0 -mtriple=i686-unknown -o - %s | FileCheck %s -check-prefix=6860 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s -check-prefix=X64 -; RUN: llc -mtriple=i686-unknown -o - %s | FileCheck %s -check-prefix=686 +; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=X64-O0 +; RUN: llc -O0 -mtriple=i686-unknown < %s | FileCheck %s -check-prefix=X86-O0 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=X64 +; RUN: llc -mtriple=i686-unknown < %s | FileCheck %s -check-prefix=X86 @var_22 = external dso_local global i16, align 2 @var_27 = external dso_local global i16, align 2 define void @foo() { -; X640-LABEL: foo: -; X640: # %bb.0: # %bb -; X640-NEXT: movzwl var_22, %eax -; X640-NEXT: movzwl var_27, %ecx -; X640-NEXT: xorl %ecx, %eax -; X640-NEXT: movzwl var_27, %ecx -; X640-NEXT: xorl %ecx, %eax -; X640-NEXT: cltq -; X640-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X640-NEXT: movzwl var_22, %eax -; X640-NEXT: movzwl var_27, %ecx -; X640-NEXT: xorl %ecx, %eax -; X640-NEXT: movzwl var_27, %ecx -; X640-NEXT: xorl %ecx, %eax -; X640-NEXT: cltq -; X640-NEXT: movzwl var_27, %ecx -; X640-NEXT: subl $16610, %ecx # imm = 0x40E2 -; X640-NEXT: movl %ecx, %ecx -; X640-NEXT: # kill: def $rcx killed $ecx -; X640-NEXT: # kill: def $cl killed $rcx -; X640-NEXT: sarq %cl, %rax -; X640-NEXT: movb %al, %cl -; X640-NEXT: # implicit-def: $rax -; X640-NEXT: movb %cl, (%rax) -; X640-NEXT: retq +; X64-O0-LABEL: foo: +; X64-O0: # %bb.0: # %bb +; X64-O0-NEXT: movzwl var_22, %eax +; X64-O0-NEXT: movzwl var_27, %ecx +; X64-O0-NEXT: xorl %ecx, %eax +; X64-O0-NEXT: movzwl var_27, %ecx +; X64-O0-NEXT: xorl %ecx, %eax +; X64-O0-NEXT: cltq +; X64-O0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-O0-NEXT: movzwl var_22, %eax +; X64-O0-NEXT: movzwl var_27, %ecx +; X64-O0-NEXT: xorl %ecx, %eax +; X64-O0-NEXT: movzwl var_27, %ecx +; X64-O0-NEXT: xorl %ecx, %eax +; X64-O0-NEXT: cltq +; X64-O0-NEXT: movzwl var_27, %ecx +; X64-O0-NEXT: subl $16610, %ecx # imm = 0x40E2 +; X64-O0-NEXT: movl %ecx, %ecx +; X64-O0-NEXT: # kill: def $rcx killed $ecx +; X64-O0-NEXT: # kill: def $cl killed $rcx +; X64-O0-NEXT: sarq %cl, %rax +; X64-O0-NEXT: movb %al, %cl +; X64-O0-NEXT: # implicit-def: $rax +; X64-O0-NEXT: movb %cl, (%rax) +; X64-O0-NEXT: retq ; -; 6860-LABEL: foo: -; 6860: # %bb.0: # %bb -; 6860-NEXT: pushl %ebp -; 6860-NEXT: .cfi_def_cfa_offset 8 -; 6860-NEXT: .cfi_offset %ebp, -8 -; 6860-NEXT: movl %esp, %ebp -; 6860-NEXT: .cfi_def_cfa_register %ebp -; 6860-NEXT: andl $-8, %esp -; 6860-NEXT: subl $24, %esp -; 6860-NEXT: movzwl var_22, %eax -; 6860-NEXT: movl %eax, {{[0-9]+}}(%esp) -; 6860-NEXT: movl $0, {{[0-9]+}}(%esp) -; 6860-NEXT: movzwl var_22, %edx -; 
6860-NEXT: movb var_27, %cl -; 6860-NEXT: addb $30, %cl -; 6860-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; 6860-NEXT: xorl %eax, %eax -; 6860-NEXT: shrdl %cl, %eax, %edx -; 6860-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload -; 6860-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 6860-NEXT: testb $32, %cl -; 6860-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 6860-NEXT: jne .LBB0_2 -; 6860-NEXT: # %bb.1: # %bb -; 6860-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 6860-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 6860-NEXT: .LBB0_2: # %bb -; 6860-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 6860-NEXT: movb %al, %cl -; 6860-NEXT: # implicit-def: $eax -; 6860-NEXT: movb %cl, (%eax) -; 6860-NEXT: movl %ebp, %esp -; 6860-NEXT: popl %ebp -; 6860-NEXT: .cfi_def_cfa %esp, 4 -; 6860-NEXT: retl +; X86-O0-LABEL: foo: +; X86-O0: # %bb.0: # %bb +; X86-O0-NEXT: pushl %ebp +; X86-O0-NEXT: .cfi_def_cfa_offset 8 +; X86-O0-NEXT: .cfi_offset %ebp, -8 +; X86-O0-NEXT: movl %esp, %ebp +; X86-O0-NEXT: .cfi_def_cfa_register %ebp +; X86-O0-NEXT: andl $-8, %esp +; X86-O0-NEXT: subl $24, %esp +; X86-O0-NEXT: movzwl var_22, %eax +; X86-O0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-O0-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-O0-NEXT: movzwl var_22, %edx +; X86-O0-NEXT: movb var_27, %cl +; X86-O0-NEXT: addb $30, %cl +; X86-O0-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-O0-NEXT: xorl %eax, %eax +; X86-O0-NEXT: shrdl %cl, %eax, %edx +; X86-O0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; X86-O0-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-O0-NEXT: testb $32, %cl +; X86-O0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-O0-NEXT: jne .LBB0_2 +; X86-O0-NEXT: # %bb.1: # %bb +; X86-O0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-O0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-O0-NEXT: .LBB0_2: # %bb +; X86-O0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-O0-NEXT: movb %al, %cl +; X86-O0-NEXT: # implicit-def: $eax +; X86-O0-NEXT: movb %cl, (%eax) +; X86-O0-NEXT: movl %ebp, %esp +; X86-O0-NEXT: popl %ebp +; X86-O0-NEXT: .cfi_def_cfa %esp, 4 +; X86-O0-NEXT: retl ; ; X64-LABEL: foo: ; X64: # %bb.0: # %bb @@ -80,32 +80,32 @@ define void @foo() { ; X64-NEXT: movb %al, (%rax) ; X64-NEXT: retq ; -; 686-LABEL: foo: -; 686: # %bb.0: # %bb -; 686-NEXT: pushl %ebp -; 686-NEXT: .cfi_def_cfa_offset 8 -; 686-NEXT: .cfi_offset %ebp, -8 -; 686-NEXT: movl %esp, %ebp -; 686-NEXT: .cfi_def_cfa_register %ebp -; 686-NEXT: andl $-8, %esp -; 686-NEXT: subl $8, %esp -; 686-NEXT: movzbl var_27, %ecx -; 686-NEXT: movzwl var_22, %eax -; 686-NEXT: movl %eax, (%esp) -; 686-NEXT: movl $0, {{[0-9]+}}(%esp) -; 686-NEXT: addb $30, %cl -; 686-NEXT: xorl %edx, %edx -; 686-NEXT: shrdl %cl, %edx, %eax -; 686-NEXT: testb $32, %cl -; 686-NEXT: jne .LBB0_2 -; 686-NEXT: # %bb.1: # %bb -; 686-NEXT: movl %eax, %edx -; 686-NEXT: .LBB0_2: # %bb -; 686-NEXT: movb %dl, (%eax) -; 686-NEXT: movl %ebp, %esp -; 686-NEXT: popl %ebp -; 686-NEXT: .cfi_def_cfa %esp, 4 -; 686-NEXT: retl +; X86-LABEL: foo: +; X86: # %bb.0: # %bb +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: movzbl var_27, %ecx +; X86-NEXT: movzwl var_22, %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) +; X86-NEXT: addb $30, %cl +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: shrdl %cl, %edx, %eax +; X86-NEXT: testb $32, %cl +; X86-NEXT: jne .LBB0_2 +; X86-NEXT: # %bb.1: # %bb +; X86-NEXT: movl %eax, %edx +; X86-NEXT: .LBB0_2: # %bb +; X86-NEXT: movb %dl, (%eax) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl bb: %tmp = alloca i64, align 8 %tmp1 = load i16, i16* @var_22, align 2 diff --git a/llvm/test/CodeGen/X86/prefetch.ll b/llvm/test/CodeGen/X86/prefetch.ll index f00788145f4e0a..404d49b63f25c9 100644 --- a/llvm/test/CodeGen/X86/prefetch.ll +++ b/llvm/test/CodeGen/X86/prefetch.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-- -mattr=+sse | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prfchw | FileCheck %s -check-prefix=PRFCHWSSE -; RUN: llc < %s -mtriple=i686-- -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHWSSE -; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=PRFCHWSSE -; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHWSSE -; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=SSE -; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1 -; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1 -; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+3dnow,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1 -; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s -check-prefix=3DNOW -; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow,+prfchw | FileCheck %s -check-prefix=3DNOW +; RUN: llc < %s -mtriple=i686-- -mattr=+sse | FileCheck %s --check-prefix=X86-SSE +; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefix=X86-SSE +; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prfchw | FileCheck %s -check-prefix=X86-PRFCHWSSE +; RUN: llc < %s -mtriple=i686-- -mattr=+prfchw | FileCheck %s -check-prefix=X86-PRFCHWSSE +; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=X86-PRFCHWSSE +; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=X86-PRFCHWSSE +; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=X86-SSE +; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1 +; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1 +; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+3dnow,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1 +; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s -check-prefix=X86-3DNOW +; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow,+prfchw | FileCheck %s -check-prefix=X86-3DNOW ; Rules: ; 3dnow by itself get you just the single prefetch instruction with no hints @@ -21,68 +21,68 @@ ; rdar://10538297 -define void @t(i8* %ptr) nounwind { -; SSE-LABEL: t: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-NEXT: prefetcht2 (%eax) -; SSE-NEXT: prefetcht1 (%eax) -; SSE-NEXT: prefetcht0 (%eax) -; SSE-NEXT: prefetchnta (%eax) -; SSE-NEXT: prefetcht2 (%eax) -; SSE-NEXT: prefetcht1 (%eax) -; SSE-NEXT: prefetcht0 (%eax) -; SSE-NEXT: prefetchnta (%eax) -; SSE-NEXT: retl +define void @t(ptr %ptr) nounwind { +; X86-SSE-LABEL: t: +; X86-SSE: # %bb.0: # %entry +; 
X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: prefetcht2 (%eax) +; X86-SSE-NEXT: prefetcht1 (%eax) +; X86-SSE-NEXT: prefetcht0 (%eax) +; X86-SSE-NEXT: prefetchnta (%eax) +; X86-SSE-NEXT: prefetcht2 (%eax) +; X86-SSE-NEXT: prefetcht1 (%eax) +; X86-SSE-NEXT: prefetcht0 (%eax) +; X86-SSE-NEXT: prefetchnta (%eax) +; X86-SSE-NEXT: retl ; -; PRFCHWSSE-LABEL: t: -; PRFCHWSSE: # %bb.0: # %entry -; PRFCHWSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; PRFCHWSSE-NEXT: prefetcht2 (%eax) -; PRFCHWSSE-NEXT: prefetcht1 (%eax) -; PRFCHWSSE-NEXT: prefetcht0 (%eax) -; PRFCHWSSE-NEXT: prefetchnta (%eax) -; PRFCHWSSE-NEXT: prefetchw (%eax) -; PRFCHWSSE-NEXT: prefetchw (%eax) -; PRFCHWSSE-NEXT: prefetchw (%eax) -; PRFCHWSSE-NEXT: prefetchw (%eax) -; PRFCHWSSE-NEXT: retl +; X86-PRFCHWSSE-LABEL: t: +; X86-PRFCHWSSE: # %bb.0: # %entry +; X86-PRFCHWSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-PRFCHWSSE-NEXT: prefetcht2 (%eax) +; X86-PRFCHWSSE-NEXT: prefetcht1 (%eax) +; X86-PRFCHWSSE-NEXT: prefetcht0 (%eax) +; X86-PRFCHWSSE-NEXT: prefetchnta (%eax) +; X86-PRFCHWSSE-NEXT: prefetchw (%eax) +; X86-PRFCHWSSE-NEXT: prefetchw (%eax) +; X86-PRFCHWSSE-NEXT: prefetchw (%eax) +; X86-PRFCHWSSE-NEXT: prefetchw (%eax) +; X86-PRFCHWSSE-NEXT: retl ; -; PREFETCHWT1-LABEL: t: -; PREFETCHWT1: # %bb.0: # %entry -; PREFETCHWT1-NEXT: movl {{[0-9]+}}(%esp), %eax -; PREFETCHWT1-NEXT: prefetcht2 (%eax) -; PREFETCHWT1-NEXT: prefetcht1 (%eax) -; PREFETCHWT1-NEXT: prefetcht0 (%eax) -; PREFETCHWT1-NEXT: prefetchnta (%eax) -; PREFETCHWT1-NEXT: prefetchwt1 (%eax) -; PREFETCHWT1-NEXT: prefetchwt1 (%eax) -; PREFETCHWT1-NEXT: prefetchw (%eax) -; PREFETCHWT1-NEXT: prefetchwt1 (%eax) -; PREFETCHWT1-NEXT: retl +; X86-PREFETCHWT1-LABEL: t: +; X86-PREFETCHWT1: # %bb.0: # %entry +; X86-PREFETCHWT1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-PREFETCHWT1-NEXT: prefetcht2 (%eax) +; X86-PREFETCHWT1-NEXT: prefetcht1 (%eax) +; X86-PREFETCHWT1-NEXT: prefetcht0 (%eax) +; X86-PREFETCHWT1-NEXT: prefetchnta (%eax) +; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax) +; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax) +; X86-PREFETCHWT1-NEXT: prefetchw (%eax) +; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax) +; X86-PREFETCHWT1-NEXT: retl ; -; 3DNOW-LABEL: t: -; 3DNOW: # %bb.0: # %entry -; 3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; 3DNOW-NEXT: prefetch (%eax) -; 3DNOW-NEXT: prefetch (%eax) -; 3DNOW-NEXT: prefetch (%eax) -; 3DNOW-NEXT: prefetch (%eax) -; 3DNOW-NEXT: prefetchw (%eax) -; 3DNOW-NEXT: prefetchw (%eax) -; 3DNOW-NEXT: prefetchw (%eax) -; 3DNOW-NEXT: prefetchw (%eax) -; 3DNOW-NEXT: retl +; X86-3DNOW-LABEL: t: +; X86-3DNOW: # %bb.0: # %entry +; X86-3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-3DNOW-NEXT: prefetch (%eax) +; X86-3DNOW-NEXT: prefetch (%eax) +; X86-3DNOW-NEXT: prefetch (%eax) +; X86-3DNOW-NEXT: prefetch (%eax) +; X86-3DNOW-NEXT: prefetchw (%eax) +; X86-3DNOW-NEXT: prefetchw (%eax) +; X86-3DNOW-NEXT: prefetchw (%eax) +; X86-3DNOW-NEXT: prefetchw (%eax) +; X86-3DNOW-NEXT: retl entry: - tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 1, i32 1 ) - tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 2, i32 1 ) - tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 ) - tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 0, i32 1 ) - tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 1, i32 1 ) - tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 2, i32 1 ) - tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 3, i32 1 ) - tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 0, i32 1 ) - ret void + tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 1, i32 
1 ) + tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 2, i32 1 ) + tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 3, i32 1 ) + tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 0, i32 1 ) + tail call void @llvm.prefetch( ptr %ptr, i32 1, i32 1, i32 1 ) + tail call void @llvm.prefetch( ptr %ptr, i32 1, i32 2, i32 1 ) + tail call void @llvm.prefetch( ptr %ptr, i32 1, i32 3, i32 1 ) + tail call void @llvm.prefetch( ptr %ptr, i32 1, i32 0, i32 1 ) + ret void } -declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind +declare void @llvm.prefetch(ptr, i32, i32, i32) nounwind diff --git a/llvm/test/CodeGen/X86/sibcall-2.ll b/llvm/test/CodeGen/X86/sibcall-2.ll index d2b78aae90b410..ca42fbafde14f7 100644 --- a/llvm/test/CodeGen/X86/sibcall-2.ll +++ b/llvm/test/CodeGen/X86/sibcall-2.ll @@ -1,52 +1,48 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=i386-apple-darwin -frame-pointer=all | FileCheck %s -check-prefix=32 -; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin -frame-pointer=all | FileCheck %s -check-prefix=64 +; RUN: llc -verify-machineinstrs < %s -mtriple=i386-apple-darwin -frame-pointer=all | FileCheck %s -check-prefix=X86 +; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin -frame-pointer=all | FileCheck %s -check-prefix=X64 ; Tail call should not use ebp / rbp after it's popped. Use esp / rsp. -define void @t1(i8* nocapture %value) nounwind { +define void @t1(ptr nocapture %value) nounwind { entry: -; 32-LABEL: t1: -; 32: jmpl *4(%esp) +; X86-LABEL: t1: +; X86: jmpl *4(%esp) -; 64-LABEL: t1: -; 64: jmpq *%rdi - %0 = bitcast i8* %value to void ()* - tail call void %0() nounwind +; X64-LABEL: t1: +; X64: jmpq *%rdi + tail call void %value() nounwind ret void } -define void @t2(i32 %a, i8* nocapture %value) nounwind { +define void @t2(i32 %a, ptr nocapture %value) nounwind { entry: -; 32-LABEL: t2: -; 32: jmpl *8(%esp) +; X86-LABEL: t2: +; X86: jmpl *8(%esp) -; 64-LABEL: t2: -; 64: jmpq *%rsi - %0 = bitcast i8* %value to void ()* - tail call void %0() nounwind +; X64-LABEL: t2: +; X64: jmpq *%rsi + tail call void %value() nounwind ret void } -define void @t3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i8* nocapture %value) nounwind { +define void @t3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, ptr nocapture %value) nounwind { entry: -; 32-LABEL: t3: -; 32: jmpl *28(%esp) +; X86-LABEL: t3: +; X86: jmpl *28(%esp) -; 64-LABEL: t3: -; 64: jmpq *8(%rsp) - %0 = bitcast i8* %value to void ()* - tail call void %0() nounwind +; X64-LABEL: t3: +; X64: jmpq *8(%rsp) + tail call void %value() nounwind ret void } -define void @t4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i8* nocapture %value) nounwind { +define void @t4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, ptr nocapture %value) nounwind { entry: -; 32-LABEL: t4: -; 32: jmpl *32(%esp) +; X86-LABEL: t4: +; X86: jmpl *32(%esp) -; 64-LABEL: t4: -; 64: jmpq *16(%rsp) - %0 = bitcast i8* %value to void ()* - tail call void %0() nounwind +; X64-LABEL: t4: +; X64: jmpq *16(%rsp) + tail call void %value() nounwind ret void } diff --git a/llvm/test/CodeGen/X86/sibcall-byval.ll b/llvm/test/CodeGen/X86/sibcall-byval.ll index 999e12334c6c36..0e06833ad70bb8 100644 --- a/llvm/test/CodeGen/X86/sibcall-byval.ll +++ b/llvm/test/CodeGen/X86/sibcall-byval.ll @@ -1,31 +1,31 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin9 | FileCheck %s -check-prefix=32 -; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=64 +; RUN: llc < %s -mtriple=i386-apple-darwin9 | FileCheck %s 
-check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=X64 %struct.p = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } -define i32 @f(%struct.p* byval(%struct.p) align 4 %q) nounwind ssp { +define i32 @f(ptr byval(%struct.p) align 4 %q) nounwind ssp { entry: -; 32: _f: -; 32: jmp _g +; X86: _f: +; X86: jmp _g -; 64: _f: -; 64: jmp _g - %call = tail call i32 @g(%struct.p* byval(%struct.p) align 4 %q) nounwind +; X64: _f: +; X64: jmp _g + %call = tail call i32 @g(ptr byval(%struct.p) align 4 %q) nounwind ret i32 %call } -declare i32 @g(%struct.p* byval(%struct.p) align 4) +declare i32 @g(ptr byval(%struct.p) align 4) -define i32 @h(%struct.p* byval(%struct.p) align 4 %q, i32 %r) nounwind ssp { +define i32 @h(ptr byval(%struct.p) align 4 %q, i32 %r) nounwind ssp { entry: -; 32: _h: -; 32: jmp _i +; X86: _h: +; X86: jmp _i -; 64: _h: -; 64: jmp _i +; X64: _h: +; X64: jmp _i - %call = tail call i32 @i(%struct.p* byval(%struct.p) align 4 %q, i32 %r) nounwind + %call = tail call i32 @i(ptr byval(%struct.p) align 4 %q, i32 %r) nounwind ret i32 %call } -declare i32 @i(%struct.p* byval(%struct.p) align 4, i32) +declare i32 @i(ptr byval(%struct.p) align 4, i32) diff --git a/llvm/test/CodeGen/X86/x32-va_start.ll b/llvm/test/CodeGen/X86/x32-va_start.ll index 1697b7db2b1675..9c0f0dab1ab520 100644 --- a/llvm/test/CodeGen/X86/x32-va_start.ll +++ b/llvm/test/CodeGen/X86/x32-va_start.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s -check-prefix=SSE ; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -mattr=-sse | FileCheck %s -check-prefix=NOSSE -; RUN: llc < %s -mtriple=i386-linux-gnux32 | FileCheck %s -check-prefix=32BITABI -; RUN: llc < %s -mtriple=i686-linux-gnux32 | FileCheck %s -check-prefix=32BITABI +; RUN: llc < %s -mtriple=i386-linux-gnux32 | FileCheck %s -check-prefix=X32BITABI +; RUN: llc < %s -mtriple=i686-linux-gnux32 | FileCheck %s -check-prefix=X32BITABI ; ; Verifies that x32 va_start lowering is sane. To regenerate this test, use ; cat <&1 | FileCheck %s -; RUN: not llc -mtriple x86_64-unknown-linux-gnu -o %t.o -filetype=obj %s 2>&1 | FileCheck %s - -; Assembler-aware expression evaluation should be disabled in inline -; assembly to prevent differences in behavior between object and -; assembly output. +; RUN: not llc -mtriple=x86_64 %s -o /dev/null 2>&1 | FileCheck %s +; RUN: llc -mtriple=x86_64 -no-integrated-as < %s | FileCheck %s --check-prefix=GAS +; RUN: llc -mtriple=x86_64 -filetype=obj %s -o - | llvm-objdump -d - | FileCheck %s --check-prefix=DISASM +; GAS: nop; .if . - foo==1; nop;.endif ; CHECK: :1:17: error: expected absolute expression +; DISASM:
: +; DISASM-NEXT: nop +; DISASM-NEXT: nop +; DISASM-NEXT: xorl %eax, %eax +; DISASM-NEXT: retq + define i32 @main() local_unnamed_addr { tail call void asm sideeffect "foo: nop; .if . - foo==1; nop;.endif", "~{dirflag},~{fpsr},~{flags}"() ret i32 0 diff --git a/llvm/test/MC/X86/abs8.s b/llvm/test/MC/X86/abs8.s index b933c9ddff9032..71936acb37408a 100644 --- a/llvm/test/MC/X86/abs8.s +++ b/llvm/test/MC/X86/abs8.s @@ -1,8 +1,8 @@ -// RUN: llvm-mc -filetype=obj %s -o - -triple i686-pc-linux | llvm-objdump --no-print-imm-hex -d -r - | FileCheck --check-prefix=32 %s -// RUN: llvm-mc -filetype=obj %s -o - -triple x86_64-pc-linux | llvm-objdump --no-print-imm-hex -d -r - | FileCheck --check-prefix=64 %s +// RUN: llvm-mc -filetype=obj %s -o - -triple i686-pc-linux | llvm-objdump --no-print-imm-hex -d -r - | FileCheck --check-prefix=X86 %s +// RUN: llvm-mc -filetype=obj %s -o - -triple x86_64-pc-linux | llvm-objdump --no-print-imm-hex -d -r - | FileCheck --check-prefix=X64 %s -// 32: 0: 83 ff 00 cmpl $0, %edi -// 32: 00000002: R_386_8 foo -// 64: 0: 83 ff 00 cmpl $0, %edi -// 64: 0000000000000002: R_X86_64_8 foo +// X86: 0: 83 ff 00 cmpl $0, %edi +// X86: 00000002: R_386_8 foo +// X64: 0: 83 ff 00 cmpl $0, %edi +// X64: 0000000000000002: R_X86_64_8 foo cmp $foo@ABS8, %edi diff --git a/llvm/test/MC/X86/align-branch-variant-symbol.s b/llvm/test/MC/X86/align-branch-variant-symbol.s index 53afdf58bff39e..a1b7b895a354c5 100644 --- a/llvm/test/MC/X86/align-branch-variant-symbol.s +++ b/llvm/test/MC/X86/align-branch-variant-symbol.s @@ -1,6 +1,5 @@ -# RUN: llvm-mc -filetype=obj -triple x86_64 --x86-align-branch-boundary=32 --x86-align-branch=call+indirect %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefixes=64BIT,CHECK - -# RUN: llvm-mc -filetype=obj -triple i386 --x86-align-branch-boundary=32 --x86-align-branch=call+indirect %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefixes=32BIT,CHECK +# RUN: llvm-mc -filetype=obj -triple x86_64 --x86-align-branch-boundary=32 --x86-align-branch=call+indirect %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefixes=CHECK,X64 +# RUN: llvm-mc -filetype=obj -triple i386 --x86-align-branch-boundary=32 --x86-align-branch=call+indirect %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefixes=CHECK,X86 # Exercise cases where the instruction to be aligned has a variant symbol # operand, and we can't add before it since linker may rewrite it. 
@@ -14,8 +13,8 @@ foo: int3 .endr # CHECK: 1d: int3 - # 64BIT: 1e: callq - # 32BIT: 1e: calll + # X64: 1e: callq + # X86: 1e: calll # CHECK: 23: int3 call ___tls_get_addr@PLT int3 @@ -25,10 +24,10 @@ foo: int3 .endr # CHECK: 5d: int3 - # 64BIT: 5e: callq *(%ecx) - # 64BIT: 65: int3 - # 32BIT: 5e: calll *(%ecx) - # 32BIT: 64: int3 + # X64: 5e: callq *(%ecx) + # X64: 65: int3 + # X86: 5e: calll *(%ecx) + # X86: 64: int3 call *___tls_get_addr@GOT(%ecx) int3 @@ -37,10 +36,10 @@ foo: int3 .endr # CHECK: 9d: int3 - # 64BIT: 9e: callq *(%eax) - # 64BIT: a1: int3 - # 32BIT: 9e: calll *(%eax) - # 32BIT: a0: int3 + # X64: 9e: callq *(%eax) + # X64: a1: int3 + # X86: 9e: calll *(%eax) + # X86: a0: int3 call *foo@tlscall(%eax) int3 @@ -49,9 +48,9 @@ foo: int3 .endr # CHECK: dd: int3 - # 64BIT: de: jmpq *(%eax) - # 64BIT: e1: int3 - # 32BIT: de: jmpl *(%eax) - # 32BIT: e0: int3 + # X64: de: jmpq *(%eax) + # X64: e1: int3 + # X86: de: jmpl *(%eax) + # X86: e0: int3 jmp *foo@tlscall(%eax) int3 diff --git a/llvm/test/MC/X86/data-prefix-fail.s b/llvm/test/MC/X86/data-prefix-fail.s index bd5b62ddc9bedf..3088864fc909b0 100644 --- a/llvm/test/MC/X86/data-prefix-fail.s +++ b/llvm/test/MC/X86/data-prefix-fail.s @@ -1,30 +1,30 @@ -// RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=64 %s +// RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-64 %s // RUN: FileCheck --check-prefix=ERR64 < %t.err %s -// RUN: not llvm-mc -triple i386-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=32 %s +// RUN: not llvm-mc -triple i386-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-32 %s // RUN: FileCheck --check-prefix=ERR32 < %t.err %s -// RUN: not llvm-mc -triple i386-unknown-unknown-code16 --show-encoding %s 2> %t.err | FileCheck --check-prefix=16 %s +// RUN: not llvm-mc -triple i386-unknown-unknown-code16 --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-16 %s // RUN: FileCheck --check-prefix=ERR16 < %t.err %s // ERR64: error: 'data32' is not supported in 64-bit mode // ERR32: error: redundant data32 prefix -// 16: lgdtl 0 -// 16-SAME: encoding: [0x66,0x0f,0x01,0x16,0x00,0x00] +// X86-16: lgdtl 0 +// X86-16-SAME: encoding: [0x66,0x0f,0x01,0x16,0x00,0x00] data32 lgdt 0 -// 64: data16 -// 64: encoding: [0x66] -// 64: lgdtq 0 -// 64: encoding: [0x0f,0x01,0x14,0x25,0x00,0x00,0x00,0x00] -// 32: data16 -// 32: encoding: [0x66] -// 32: lgdtl 0 -// 32: encoding: [0x0f,0x01,0x15,0x00,0x00,0x00,0x00] +// X86-64: data16 +// X86-64: encoding: [0x66] +// X86-64: lgdtq 0 +// X86-64: encoding: [0x0f,0x01,0x14,0x25,0x00,0x00,0x00,0x00] +// X86-32: data16 +// X86-32: encoding: [0x66] +// X86-32: lgdtl 0 +// X86-32: encoding: [0x0f,0x01,0x15,0x00,0x00,0x00,0x00] // ERR16: error: redundant data16 prefix data16 lgdt 0 -// 64: data16 # encoding: [0x66] -// 64-NEXT: callq 0 # encoding: [0xe8,A,A,A,A] -// 32: data16 # encoding: [0x66] -// 32-NEXT: calll 0 # encoding: [0xe8,A,A,A,A] +// X86-64: data16 # encoding: [0x66] +// X86-64-NEXT: callq 0 # encoding: [0xe8,A,A,A,A] +// X86-32: data16 # encoding: [0x66] +// X86-32-NEXT: calll 0 # encoding: [0xe8,A,A,A,A] // ERR16: {{.*}}.s:[[#@LINE+1]]:1: error: redundant data16 prefix data16 call 0 diff --git a/llvm/test/MC/X86/displacement-overflow.s b/llvm/test/MC/X86/displacement-overflow.s index 2882147af48280..933f98cb21fcb5 100644 --- a/llvm/test/MC/X86/displacement-overflow.s +++ b/llvm/test/MC/X86/displacement-overflow.s @@ -1,14 
+1,14 @@ -# RUN: not llvm-mc -triple=x86_64 %s 2>&1 | FileCheck %s --check-prefixes=CHECK,64 --implicit-check-not=error: --implicit-check-not=warning: -# RUN: llvm-mc -triple=i686 --defsym A16=1 %s 2>&1 | FileCheck %s --check-prefixes=CHECK,32 --implicit-check-not=error: --implicit-check-not=warning: +# RUN: not llvm-mc -triple=x86_64 %s 2>&1 | FileCheck %s --check-prefixes=CHECK,X64 --implicit-check-not=error: --implicit-check-not=warning: +# RUN: llvm-mc -triple=i686 --defsym A16=1 %s 2>&1 | FileCheck %s --check-prefixes=CHECK,X86 --implicit-check-not=error: --implicit-check-not=warning: .ifndef A16 movq 0x80000000-1(%rip), %rax leaq -0x80000000(%rip), %rax -# 64: [[#@LINE+1]]:17: error: displacement 2147483648 is not within [-2147483648, 2147483647] +# X64: [[#@LINE+1]]:17: error: displacement 2147483648 is not within [-2147483648, 2147483647] movq 0x80000000(%rip), %rax -# 64: [[#@LINE+1]]:18: error: displacement -2147483649 is not within [-2147483648, 2147483647] +# X64: [[#@LINE+1]]:18: error: displacement -2147483649 is not within [-2147483648, 2147483647] leaq -0x80000001(%rip), %rax .endif @@ -31,8 +31,8 @@ leal -0xffffffff-2(%eax), %eax movw $0, 0xffff(%bp) movw $0, -0xffff(%si) -# 32: [[#@LINE+1]]:19: warning: displacement 65536 shortened to 16-bit signed 0 +# X86: [[#@LINE+1]]:19: warning: displacement 65536 shortened to 16-bit signed 0 movw $0, 0xffff+1(%bp) -# 32: [[#@LINE+1]]:20: warning: displacement -65536 shortened to 16-bit signed 0 +# X86: [[#@LINE+1]]:20: warning: displacement -65536 shortened to 16-bit signed 0 movw $0, -0xffff-1(%si) .endif diff --git a/llvm/test/MC/X86/dwarf-segment-register.s b/llvm/test/MC/X86/dwarf-segment-register.s index a6857680747064..5482588df82123 100644 --- a/llvm/test/MC/X86/dwarf-segment-register.s +++ b/llvm/test/MC/X86/dwarf-segment-register.s @@ -1,7 +1,7 @@ // RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t.64 -// RUN: llvm-objdump --dwarf=frames %t.64 | FileCheck %s --check-prefixes=64,CHECK +// RUN: llvm-objdump --dwarf=frames %t.64 | FileCheck %s --check-prefixes=X64,CHECK // RUN: llvm-mc -filetype=obj -triple i386-pc-linux-gnu %s -o %t.32 -// RUN: llvm-objdump --dwarf=frames %t.32 | FileCheck %s --check-prefixes=32,CHECK +// RUN: llvm-objdump --dwarf=frames %t.32 | FileCheck %s --check-prefixes=X86,CHECK .cfi_startproc .cfi_offset %cs, -40 @@ -12,26 +12,26 @@ .cfi_offset %gs, 0 .cfi_endproc -// 64: reg51 -// 32: reg41 +// X64: reg51 +// X86: reg41 // CHECK-SAME: -40 -// 64: reg53 -// 32: reg43 +// X64: reg53 +// X86: reg43 // CHECK-SAME: -32 -// 64: reg52 -// 32: reg42 +// X64: reg52 +// X86: reg42 // CHECK-SAME: -24 -// 64: reg50 -// 32: reg40 +// X64: reg50 +// X86: reg40 // CHECK-SAME: -16 -// 64: reg54 -// 32: reg44 +// X64: reg54 +// X86: reg44 // CHECK-SAME: -8 -// 64: reg55 -// 32: reg45 +// X64: reg55 +// X86: reg45 // CHECK-SAME: 0 diff --git a/llvm/test/MC/X86/index-operations.s b/llvm/test/MC/X86/index-operations.s index 59498b0ced1249..c425ba8057c5f2 100644 --- a/llvm/test/MC/X86/index-operations.s +++ b/llvm/test/MC/X86/index-operations.s @@ -1,34 +1,34 @@ -// RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=64 %s +// RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-64 %s // RUN: FileCheck --input-file=%t.err %s --check-prefix=ERR64 --implicit-check-not=error: -// RUN: not llvm-mc -triple i386-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=32 %s +// RUN: not 
llvm-mc -triple i386-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-32 %s // RUN: FileCheck --check-prefix=ERR32 < %t.err %s -// RUN: not llvm-mc -triple i386-unknown-unknown-code16 --show-encoding %s 2> %t.err | FileCheck --check-prefix=16 %s +// RUN: not llvm-mc -triple i386-unknown-unknown-code16 --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-16 %s // RUN: FileCheck --check-prefix=ERR16 < %t.err %s lodsb -// 64: lodsb (%rsi), %al # encoding: [0xac] -// 32: lodsb (%esi), %al # encoding: [0xac] -// 16: lodsb (%si), %al # encoding: [0xac] +// X86-64: lodsb (%rsi), %al # encoding: [0xac] +// X86-32: lodsb (%esi), %al # encoding: [0xac] +// X86-16: lodsb (%si), %al # encoding: [0xac] lodsb (%rsi), %al -// 64: lodsb (%rsi), %al # encoding: [0xac] +// X86-64: lodsb (%rsi), %al # encoding: [0xac] // ERR32: 64-bit // ERR16: 64-bit lodsb (%esi), %al -// 64: lodsb (%esi), %al # encoding: [0x67,0xac] -// 32: lodsb (%esi), %al # encoding: [0xac] -// 16: lodsb (%esi), %al # encoding: [0x67,0xac] +// X86-64: lodsb (%esi), %al # encoding: [0x67,0xac] +// X86-32: lodsb (%esi), %al # encoding: [0xac] +// X86-16: lodsb (%esi), %al # encoding: [0x67,0xac] lodsb (%si), %al // ERR64: [[#@LINE-1]]:[[#]]: error: invalid 16-bit base register -// 32: lodsb (%si), %al # encoding: [0x67,0xac] -// 16: lodsb (%si), %al # encoding: [0xac] +// X86-32: lodsb (%si), %al # encoding: [0x67,0xac] +// X86-16: lodsb (%si), %al # encoding: [0xac] lodsl %gs:(%esi) -// 64: lodsl %gs:(%esi), %eax # encoding: [0x67,0x65,0xad] -// 32: lodsl %gs:(%esi), %eax # encoding: [0x65,0xad] -// 16: lodsl %gs:(%esi), %eax # encoding: [0x67,0x65,0x66,0xad] +// X86-64: lodsl %gs:(%esi), %eax # encoding: [0x67,0x65,0xad] +// X86-32: lodsl %gs:(%esi), %eax # encoding: [0x65,0xad] +// X86-16: lodsl %gs:(%esi), %eax # encoding: [0x67,0x65,0x66,0xad] lodsl (%edi), %eax // ERR64: [[#@LINE-1]]:[[#]]: error: invalid operand @@ -41,19 +41,19 @@ lodsl 44(%edi), %eax // ERR16: invalid operand lods (%esi), %ax -// 64: lodsw (%esi), %ax # encoding: [0x67,0x66,0xad] -// 32: lodsw (%esi), %ax # encoding: [0x66,0xad] -// 16: lodsw (%esi), %ax # encoding: [0x67,0xad] +// X86-64: lodsw (%esi), %ax # encoding: [0x67,0x66,0xad] +// X86-32: lodsw (%esi), %ax # encoding: [0x66,0xad] +// X86-16: lodsw (%esi), %ax # encoding: [0x67,0xad] stosw -// 64: stosw %ax, %es:(%rdi) # encoding: [0x66,0xab] -// 32: stosw %ax, %es:(%edi) # encoding: [0x66,0xab] -// 16: stosw %ax, %es:(%di) # encoding: [0xab] +// X86-64: stosw %ax, %es:(%rdi) # encoding: [0x66,0xab] +// X86-32: stosw %ax, %es:(%edi) # encoding: [0x66,0xab] +// X86-16: stosw %ax, %es:(%di) # encoding: [0xab] stos %eax, (%edi) -// 64: stosl %eax, %es:(%edi) # encoding: [0x67,0xab] -// 32: stosl %eax, %es:(%edi) # encoding: [0xab] -// 16: stosl %eax, %es:(%edi) # encoding: [0x67,0x66,0xab] +// X86-64: stosl %eax, %es:(%edi) # encoding: [0x67,0xab] +// X86-32: stosl %eax, %es:(%edi) # encoding: [0xab] +// X86-16: stosl %eax, %es:(%edi) # encoding: [0x67,0x66,0xab] stosb %al, %fs:(%edi) // ERR64: [[#@LINE-1]]:[[#]]: error: invalid operand for instruction @@ -61,27 +61,27 @@ stosb %al, %fs:(%edi) // ERR16: invalid operand for instruction stosb %al, %es:(%edi) -// 64: stosb %al, %es:(%edi) # encoding: [0x67,0xaa] -// 32: stosb %al, %es:(%edi) # encoding: [0xaa] -// 16: stosb %al, %es:(%edi) # encoding: [0x67,0xaa] +// X86-64: stosb %al, %es:(%edi) # encoding: [0x67,0xaa] +// X86-32: stosb %al, %es:(%edi) # encoding: [0xaa] +// X86-16: stosb %al, %es:(%edi) # encoding: 
[0x67,0xaa] stosq -// 64: stosq %rax, %es:(%rdi) # encoding: [0x48,0xab] +// X86-64: stosq %rax, %es:(%rdi) # encoding: [0x48,0xab] // ERR32: 64-bit // ERR16: 64-bit stos %rax, (%edi) -// 64: stosq %rax, %es:(%edi) # encoding: [0x67,0x48,0xab] +// X86-64: stosq %rax, %es:(%edi) # encoding: [0x67,0x48,0xab] // ERR32: only available in 64-bit mode // ERR16: only available in 64-bit mode scas %es:(%edi), %al -// 64: scasb %es:(%edi), %al # encoding: [0x67,0xae] -// 32: scasb %es:(%edi), %al # encoding: [0xae] -// 16: scasb %es:(%edi), %al # encoding: [0x67,0xae] +// X86-64: scasb %es:(%edi), %al # encoding: [0x67,0xae] +// X86-32: scasb %es:(%edi), %al # encoding: [0xae] +// X86-16: scasb %es:(%edi), %al # encoding: [0x67,0xae] scasq %es:(%edi) -// 64: scasq %es:(%edi), %rax # encoding: [0x67,0x48,0xaf] +// X86-64: scasq %es:(%edi), %rax # encoding: [0x67,0x48,0xaf] // ERR32: 64-bit // ERR16: 64-bit @@ -92,18 +92,18 @@ scasl %es:(%edi), %al scas %es:(%di), %ax // ERR64: [[#@LINE-1]]:[[#]]: error: invalid 16-bit base register -// 16: scasw %es:(%di), %ax # encoding: [0xaf] -// 32: scasw %es:(%di), %ax # encoding: [0x67,0x66,0xaf] +// X86-16: scasw %es:(%di), %ax # encoding: [0xaf] +// X86-32: scasw %es:(%di), %ax # encoding: [0x67,0x66,0xaf] cmpsb -// 64: cmpsb %es:(%rdi), (%rsi) # encoding: [0xa6] -// 32: cmpsb %es:(%edi), (%esi) # encoding: [0xa6] -// 16: cmpsb %es:(%di), (%si) # encoding: [0xa6] +// X86-64: cmpsb %es:(%rdi), (%rsi) # encoding: [0xa6] +// X86-32: cmpsb %es:(%edi), (%esi) # encoding: [0xa6] +// X86-16: cmpsb %es:(%di), (%si) # encoding: [0xa6] cmpsw (%edi), (%esi) -// 64: cmpsw %es:(%edi), (%esi) # encoding: [0x67,0x66,0xa7] -// 32: cmpsw %es:(%edi), (%esi) # encoding: [0x66,0xa7] -// 16: cmpsw %es:(%edi), (%esi) # encoding: [0x67,0xa7] +// X86-64: cmpsw %es:(%edi), (%esi) # encoding: [0x67,0x66,0xa7] +// X86-32: cmpsw %es:(%edi), (%esi) # encoding: [0x66,0xa7] +// X86-16: cmpsw %es:(%edi), (%esi) # encoding: [0x67,0xa7] cmpsb (%di), (%esi) // ERR64: [[#@LINE-1]]:[[#]]: error: invalid 16-bit base register @@ -111,52 +111,52 @@ cmpsb (%di), (%esi) // ERR16: mismatching source and destination cmpsl %es:(%edi), %ss:(%esi) -// 64: cmpsl %es:(%edi), %ss:(%esi) # encoding: [0x67,0x36,0xa7] -// 32: cmpsl %es:(%edi), %ss:(%esi) # encoding: [0x36,0xa7] -// 16: cmpsl %es:(%edi), %ss:(%esi) # encoding: [0x67,0x36,0x66,0xa7] +// X86-64: cmpsl %es:(%edi), %ss:(%esi) # encoding: [0x67,0x36,0xa7] +// X86-32: cmpsl %es:(%edi), %ss:(%esi) # encoding: [0x36,0xa7] +// X86-16: cmpsl %es:(%edi), %ss:(%esi) # encoding: [0x67,0x36,0x66,0xa7] cmpsq (%rdi), (%rsi) -// 64: cmpsq %es:(%rdi), (%rsi) # encoding: [0x48,0xa7] +// X86-64: cmpsq %es:(%rdi), (%rsi) # encoding: [0x48,0xa7] // ERR32: 64-bit // ERR16: 64-bit movsb (%esi), (%edi) -// 64: movsb (%esi), %es:(%edi) # encoding: [0x67,0xa4] -// 32: movsb (%esi), %es:(%edi) # encoding: [0xa4] -// 16: movsb (%esi), %es:(%edi) # encoding: [0x67,0xa4] +// X86-64: movsb (%esi), %es:(%edi) # encoding: [0x67,0xa4] +// X86-32: movsb (%esi), %es:(%edi) # encoding: [0xa4] +// X86-16: movsb (%esi), %es:(%edi) # encoding: [0x67,0xa4] movsl %gs:(%esi), (%edi) -// 64: movsl %gs:(%esi), %es:(%edi) # encoding: [0x67,0x65,0xa5] -// 32: movsl %gs:(%esi), %es:(%edi) # encoding: [0x65,0xa5] -// 16: movsl %gs:(%esi), %es:(%edi) # encoding: [0x67,0x65,0x66,0xa5] +// X86-64: movsl %gs:(%esi), %es:(%edi) # encoding: [0x67,0x65,0xa5] +// X86-32: movsl %gs:(%esi), %es:(%edi) # encoding: [0x65,0xa5] +// X86-16: movsl %gs:(%esi), %es:(%edi) # encoding: [0x67,0x65,0x66,0xa5] 
outsb -// 64: outsb (%rsi), %dx # encoding: [0x6e] -// 32: outsb (%esi), %dx # encoding: [0x6e] -// 16: outsb (%si), %dx # encoding: [0x6e] +// X86-64: outsb (%rsi), %dx # encoding: [0x6e] +// X86-32: outsb (%esi), %dx # encoding: [0x6e] +// X86-16: outsb (%si), %dx # encoding: [0x6e] outsw %fs:(%esi), %dx -// 64: outsw %fs:(%esi), %dx # encoding: [0x67,0x64,0x66,0x6f] -// 32: outsw %fs:(%esi), %dx # encoding: [0x64,0x66,0x6f] -// 16: outsw %fs:(%esi), %dx # encoding: [0x67,0x64,0x6f] +// X86-64: outsw %fs:(%esi), %dx # encoding: [0x67,0x64,0x66,0x6f] +// X86-32: outsw %fs:(%esi), %dx # encoding: [0x64,0x66,0x6f] +// X86-16: outsw %fs:(%esi), %dx # encoding: [0x67,0x64,0x6f] insw %dx, (%edi) -// 64: insw %dx, %es:(%edi) # encoding: [0x67,0x66,0x6d] -// 32: insw %dx, %es:(%edi) # encoding: [0x66,0x6d] -// 16: insw %dx, %es:(%edi) # encoding: [0x67,0x6d] +// X86-64: insw %dx, %es:(%edi) # encoding: [0x67,0x66,0x6d] +// X86-32: insw %dx, %es:(%edi) # encoding: [0x66,0x6d] +// X86-16: insw %dx, %es:(%edi) # encoding: [0x67,0x6d] insw %dx, (%bx) // ERR64: [[#@LINE-1]]:[[#]]: error: invalid 16-bit base register -// 32: insw %dx, %es:(%di) # encoding: [0x67,0x66,0x6d] -// 16: insw %dx, %es:(%di) # encoding: [0x6d] +// X86-32: insw %dx, %es:(%di) # encoding: [0x67,0x66,0x6d] +// X86-16: insw %dx, %es:(%di) # encoding: [0x6d] insw %dx, (%ebx) -// 64: insw %dx, %es:(%edi) # encoding: [0x67,0x66,0x6d] -// 32: insw %dx, %es:(%edi) # encoding: [0x66,0x6d] -// 16: insw %dx, %es:(%edi) # encoding: [0x67,0x6d] +// X86-64: insw %dx, %es:(%edi) # encoding: [0x67,0x66,0x6d] +// X86-32: insw %dx, %es:(%edi) # encoding: [0x66,0x6d] +// X86-16: insw %dx, %es:(%edi) # encoding: [0x67,0x6d] insw %dx, (%rbx) -// 64: insw %dx, %es:(%rdi) # encoding: [0x66,0x6d] +// X86-64: insw %dx, %es:(%rdi) # encoding: [0x66,0x6d] // ERR32: 64-bit // ERR16: 64-bit @@ -177,17 +177,17 @@ movdir64b (%edx), %r15 // ERR64: [[#@LINE-1]]:[[#]]: error: invalid operand movdir64b (%eip), %ebx -// 64: movdir64b (%eip), %ebx # encoding: [0x67,0x66,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] +// X86-64: movdir64b (%eip), %ebx # encoding: [0x67,0x66,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] movdir64b (%rip), %rbx -// 64: movdir64b (%rip), %rbx # encoding: [0x66,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] +// X86-64: movdir64b (%rip), %rbx # encoding: [0x66,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] movdir64b 291(%esi, %eiz, 4), %ebx -// 64: movdir64b 291(%esi,%eiz,4), %ebx # encoding: [0x67,0x66,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] -// 32: movdir64b 291(%esi,%eiz,4), %ebx # encoding: [0x66,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-64: movdir64b 291(%esi,%eiz,4), %ebx # encoding: [0x67,0x66,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-32: movdir64b 291(%esi,%eiz,4), %ebx # encoding: [0x66,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] movdir64b 291(%rsi, %riz, 4), %rbx -// 64: movdir64b 291(%rsi,%riz,4), %rbx # encoding: [0x66,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-64: movdir64b 291(%rsi,%riz,4), %rbx # encoding: [0x66,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] enqcmd 291(%si), %ecx // ERR64: error: invalid 16-bit base register @@ -206,17 +206,17 @@ enqcmd (%edx), %r15 // ERR64: [[#@LINE-1]]:[[#]]: error: invalid operand enqcmd (%eip), %ebx -// 64: enqcmd (%eip), %ebx # encoding: [0x67,0xf2,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] +// X86-64: enqcmd (%eip), %ebx # encoding: [0x67,0xf2,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] enqcmd (%rip), %rbx -// 64: enqcmd (%rip), %rbx # encoding: 
[0xf2,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] +// X86-64: enqcmd (%rip), %rbx # encoding: [0xf2,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] enqcmd 291(%esi, %eiz, 4), %ebx -// 64: enqcmd 291(%esi,%eiz,4), %ebx # encoding: [0x67,0xf2,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] -// 32: enqcmd 291(%esi,%eiz,4), %ebx # encoding: [0xf2,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-64: enqcmd 291(%esi,%eiz,4), %ebx # encoding: [0x67,0xf2,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-32: enqcmd 291(%esi,%eiz,4), %ebx # encoding: [0xf2,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] enqcmd 291(%rsi, %riz, 4), %rbx -// 64: enqcmd 291(%rsi,%riz,4), %rbx # encoding: [0xf2,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-64: enqcmd 291(%rsi,%riz,4), %rbx # encoding: [0xf2,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] enqcmds 291(%si), %ecx // ERR64: error: invalid 16-bit base register @@ -235,14 +235,14 @@ enqcmds (%edx), %r15 // ERR64: [[#@LINE-1]]:[[#]]: error: invalid operand enqcmds (%eip), %ebx -// 64: enqcmds (%eip), %ebx # encoding: [0x67,0xf3,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] +// X86-64: enqcmds (%eip), %ebx # encoding: [0x67,0xf3,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] enqcmds (%rip), %rbx -// 64: enqcmds (%rip), %rbx # encoding: [0xf3,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] +// X86-64: enqcmds (%rip), %rbx # encoding: [0xf3,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] enqcmds 291(%esi, %eiz, 4), %ebx -// 64: enqcmds 291(%esi,%eiz,4), %ebx # encoding: [0x67,0xf3,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] -// 32: enqcmds 291(%esi,%eiz,4), %ebx # encoding: [0xf3,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-64: enqcmds 291(%esi,%eiz,4), %ebx # encoding: [0x67,0xf3,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-32: enqcmds 291(%esi,%eiz,4), %ebx # encoding: [0xf3,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] enqcmds 291(%rsi, %riz, 4), %rbx -// 64: enqcmds 291(%rsi,%riz,4), %rbx # encoding: [0xf3,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-64: enqcmds 291(%rsi,%riz,4), %rbx # encoding: [0xf3,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] diff --git a/llvm/test/MC/X86/ret.s b/llvm/test/MC/X86/ret.s index 142a4614ba4f1f..7b5bcd4ad990f4 100644 --- a/llvm/test/MC/X86/ret.s +++ b/llvm/test/MC/X86/ret.s @@ -1,129 +1,129 @@ -// RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=64 %s +// RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-64 %s // RUN: FileCheck --check-prefix=ERR64 < %t.err %s -// RUN: not llvm-mc -triple i386-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=32 %s +// RUN: not llvm-mc -triple i386-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-32 %s // RUN: FileCheck --check-prefix=ERR32 < %t.err %s -// RUN: not llvm-mc -triple i386-unknown-unknown-code16 --show-encoding %s 2> %t.err | FileCheck --check-prefix=16 %s +// RUN: not llvm-mc -triple i386-unknown-unknown-code16 --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-16 %s // RUN: FileCheck --check-prefix=ERR16 < %t.err %s ret -// 64: retq -// 64: encoding: [0xc3] -// 32: retl -// 32: encoding: [0xc3] -// 16: retw -// 16: encoding: [0xc3] +// X86-64: retq +// X86-64: encoding: [0xc3] +// X86-32: retl +// X86-32: encoding: [0xc3] +// X86-16: retw +// X86-16: encoding: [0xc3] retw -// 64: retw -// 64: encoding: [0x66,0xc3] -// 32: retw -// 32: encoding: [0x66,0xc3] -// 16: retw -// 16: encoding: [0xc3] +// X86-64: 
retw +// X86-64: encoding: [0x66,0xc3] +// X86-32: retw +// X86-32: encoding: [0x66,0xc3] +// X86-16: retw +// X86-16: encoding: [0xc3] retl // ERR64: error: instruction requires: Not 64-bit mode -// 32: retl -// 32: encoding: [0xc3] -// 16: retl -// 16: encoding: [0x66,0xc3] +// X86-32: retl +// X86-32: encoding: [0xc3] +// X86-16: retl +// X86-16: encoding: [0x66,0xc3] retq -// 64: retq -// 64: encoding: [0xc3] +// X86-64: retq +// X86-64: encoding: [0xc3] // ERR32: error: instruction requires: 64-bit mode // ERR16: error: instruction requires: 64-bit mode ret $0 -// 64: retq $0 -// 64: encoding: [0xc2,0x00,0x00] -// 32: retl $0 -// 32: encoding: [0xc2,0x00,0x00] -// 16: retw $0 -// 16: encoding: [0xc2,0x00,0x00] +// X86-64: retq $0 +// X86-64: encoding: [0xc2,0x00,0x00] +// X86-32: retl $0 +// X86-32: encoding: [0xc2,0x00,0x00] +// X86-16: retw $0 +// X86-16: encoding: [0xc2,0x00,0x00] retw $0 -// 64: retw $0 -// 64: encoding: [0x66,0xc2,0x00,0x00] -// 32: retw $0 -// 32: encoding: [0x66,0xc2,0x00,0x00] -// 16: retw $0 -// 16: encoding: [0xc2,0x00,0x00] +// X86-64: retw $0 +// X86-64: encoding: [0x66,0xc2,0x00,0x00] +// X86-32: retw $0 +// X86-32: encoding: [0x66,0xc2,0x00,0x00] +// X86-16: retw $0 +// X86-16: encoding: [0xc2,0x00,0x00] retl $0 // ERR64: error: instruction requires: Not 64-bit mode -// 32: retl $0 -// 32: encoding: [0xc2,0x00,0x00] -// 16: retl $0 -// 16: encoding: [0x66,0xc2,0x00,0x00] +// X86-32: retl $0 +// X86-32: encoding: [0xc2,0x00,0x00] +// X86-16: retl $0 +// X86-16: encoding: [0x66,0xc2,0x00,0x00] retq $0 -// 64: retq $0 -// 64: encoding: [0xc2,0x00,0x00] +// X86-64: retq $0 +// X86-64: encoding: [0xc2,0x00,0x00] // ERR32: error: instruction requires: 64-bit mode // ERR16: error: instruction requires: 64-bit mode retn -// 64: retq -// 64: encoding: [0xc3] -// 32: retl -// 32: encoding: [0xc3] -// 16: retw -// 16: encoding: [0xc3] +// X86-64: retq +// X86-64: encoding: [0xc3] +// X86-32: retl +// X86-32: encoding: [0xc3] +// X86-16: retw +// X86-16: encoding: [0xc3] retn $0 -// 64: retq $0 -// 64: encoding: [0xc2,0x00,0x00] -// 32: retl $0 -// 32: encoding: [0xc2,0x00,0x00] -// 16: retw $0 -// 16: encoding: [0xc2,0x00,0x00] +// X86-64: retq $0 +// X86-64: encoding: [0xc2,0x00,0x00] +// X86-32: retl $0 +// X86-32: encoding: [0xc2,0x00,0x00] +// X86-16: retw $0 +// X86-16: encoding: [0xc2,0x00,0x00] lret -// 64: lretl -// 64: encoding: [0xcb] -// 32: lretl -// 32: encoding: [0xcb] -// 16: lretw -// 16: encoding: [0xcb] +// X86-64: lretl +// X86-64: encoding: [0xcb] +// X86-32: lretl +// X86-32: encoding: [0xcb] +// X86-16: lretw +// X86-16: encoding: [0xcb] lretw -// 64: lretw -// 64: encoding: [0x66,0xcb] -// 32: lretw -// 32: encoding: [0x66,0xcb] -// 16: lretw -// 16: encoding: [0xcb] +// X86-64: lretw +// X86-64: encoding: [0x66,0xcb] +// X86-32: lretw +// X86-32: encoding: [0x66,0xcb] +// X86-16: lretw +// X86-16: encoding: [0xcb] lretl -// 64: lretl -// 64: encoding: [0xcb] -// 32: lretl -// 32: encoding: [0xcb] -// 16: lretl -// 16: encoding: [0x66,0xcb] +// X86-64: lretl +// X86-64: encoding: [0xcb] +// X86-32: lretl +// X86-32: encoding: [0xcb] +// X86-16: lretl +// X86-16: encoding: [0x66,0xcb] lretq -// 64: lretq -// 64: encoding: [0x48,0xcb] +// X86-64: lretq +// X86-64: encoding: [0x48,0xcb] // ERR32: error: instruction requires: 64-bit mode // ERR16: error: instruction requires: 64-bit mode lret $0 -// 64: lretl $0 -// 64: encoding: [0xca,0x00,0x00] -// 32: lretl $0 -// 32: encoding: [0xca,0x00,0x00] -// 16: lretw $0 -// 16: encoding: 
[0xca,0x00,0x00] +// X86-64: lretl $0 +// X86-64: encoding: [0xca,0x00,0x00] +// X86-32: lretl $0 +// X86-32: encoding: [0xca,0x00,0x00] +// X86-16: lretw $0 +// X86-16: encoding: [0xca,0x00,0x00] lretw $0 -// 64: lretw $0 -// 64: encoding: [0x66,0xca,0x00,0x00] -// 32: lretw $0 -// 32: encoding: [0x66,0xca,0x00,0x00] -// 16: lretw $0 -// 16: encoding: [0xca,0x00,0x00] +// X86-64: lretw $0 +// X86-64: encoding: [0x66,0xca,0x00,0x00] +// X86-32: lretw $0 +// X86-32: encoding: [0x66,0xca,0x00,0x00] +// X86-16: lretw $0 +// X86-16: encoding: [0xca,0x00,0x00] lretl $0 -// 64: lretl $0 -// 64: encoding: [0xca,0x00,0x00] -// 32: lretl $0 -// 32: encoding: [0xca,0x00,0x00] -// 16: lretl $0 -// 16: encoding: [0x66,0xca,0x00,0x00] +// X86-64: lretl $0 +// X86-64: encoding: [0xca,0x00,0x00] +// X86-32: lretl $0 +// X86-32: encoding: [0xca,0x00,0x00] +// X86-16: lretl $0 +// X86-16: encoding: [0x66,0xca,0x00,0x00] lretq $0 -// 64: lretq $0 -// 64: encoding: [0x48,0xca,0x00,0x00] +// X86-64: lretq $0 +// X86-64: encoding: [0x48,0xca,0x00,0x00] // ERR32: error: instruction requires: 64-bit mode // ERR16: error: instruction requires: 64-bit mode diff --git a/llvm/test/MC/X86/x86_errors.s b/llvm/test/MC/X86/x86_errors.s index f1c7110fde1f1c..da8659f3621e36 100644 --- a/llvm/test/MC/X86/x86_errors.s +++ b/llvm/test/MC/X86/x86_errors.s @@ -1,122 +1,122 @@ // RUN: not llvm-mc -triple x86_64-unknown-unknown %s 2> %t.err -// RUN: FileCheck --check-prefix=64 < %t.err %s +// RUN: FileCheck --check-prefix=X64 < %t.err %s // RUN: not llvm-mc -triple i386-unknown-unknown %s 2> %t.err -// RUN: FileCheck --check-prefix=32 < %t.err %s +// RUN: FileCheck --check-prefix=X86 < %t.err %s // rdar://8204588 -// 64: error: ambiguous instructions require an explicit suffix (could be 'cmpb', 'cmpw', 'cmpl', or 'cmpq') +// X64: error: ambiguous instructions require an explicit suffix (could be 'cmpb', 'cmpw', 'cmpl', or 'cmpq') cmp $0, 0(%eax) -// 32: error: register %rax is only available in 64-bit mode +// X86: error: register %rax is only available in 64-bit mode addl $0, 0(%rax) -// 32: test.s:8:2: error: invalid instruction mnemonic 'movi' +// X86: test.s:8:2: error: invalid instruction mnemonic 'movi' # 8 "test.s" movi $8,%eax movl 0(%rax), 0(%edx) // error: invalid operand for instruction -// 32: error: instruction requires: 64-bit mode +// X86: error: instruction requires: 64-bit mode sysexitq // rdar://10710167 -// 64: error: expected scale expression +// X64: error: expected scale expression lea (%rsp, %rbp, $4), %rax // rdar://10423777 -// 64: error: base register is 64-bit, but index register is not +// X64: error: base register is 64-bit, but index register is not movq (%rsi,%ecx),%xmm0 -// 64: error: invalid 16-bit base register +// X64: error: invalid 16-bit base register movl %eax,(%bp,%si) -// 32: error: scale factor in 16-bit address must be 1 +// X86: error: scale factor in 16-bit address must be 1 movl %eax,(%bp,%si,2) -// 32: error: invalid 16-bit base register +// X86: error: invalid 16-bit base register movl %eax,(%cx) -// 32: error: invalid 16-bit base/index register combination +// X86: error: invalid 16-bit base/index register combination movl %eax,(%bp,%bx) -// 32: error: 16-bit memory operand may not include only index register +// X86: error: 16-bit memory operand may not include only index register movl %eax,(,%bx) -// 32: error: invalid operand for instruction +// X86: error: invalid operand for instruction outb al, 4 -// 32: error: invalid segment register -// 64: error: invalid segment register 
+// X86: error: invalid segment register +// X64: error: invalid segment register movl %eax:0x00, %ebx -// 32: error: invalid operand for instruction -// 64: error: invalid operand for instruction +// X86: error: invalid operand for instruction +// X64: error: invalid operand for instruction cmpps $-129, %xmm0, %xmm0 -// 32: error: invalid operand for instruction -// 64: error: invalid operand for instruction +// X86: error: invalid operand for instruction +// X64: error: invalid operand for instruction cmppd $256, %xmm0, %xmm0 -// 32: error: instruction requires: 64-bit mode +// X86: error: instruction requires: 64-bit mode jrcxz 1 -// 64: error: instruction requires: Not 64-bit mode +// X64: error: instruction requires: Not 64-bit mode jcxz 1 -// 32: error: register %cr8 is only available in 64-bit mode +// X86: error: register %cr8 is only available in 64-bit mode movl %edx, %cr8 -// 32: error: register %dr8 is only available in 64-bit mode +// X86: error: register %dr8 is only available in 64-bit mode movl %edx, %dr8 -// 32: error: register %rip is only available in 64-bit mode -// 64: error: %rip can only be used as a base register +// X86: error: register %rip is only available in 64-bit mode +// X64: error: %rip can only be used as a base register mov %rip, %rax -// 32: error: register %rax is only available in 64-bit mode -// 64: error: %rip is not allowed as an index register +// X86: error: register %rax is only available in 64-bit mode +// X64: error: %rip is not allowed as an index register mov (%rax,%rip), %rbx -// 32: error: instruction requires: 64-bit mode +// X86: error: instruction requires: 64-bit mode ljmpq *(%eax) -// 32: error: register %rax is only available in 64-bit mode -// 64: error: invalid base+index expression +// X86: error: register %rax is only available in 64-bit mode +// X64: error: invalid base+index expression leaq (%rax,%rsp), %rax -// 32: error: invalid base+index expression -// 64: error: invalid base+index expression +// X86: error: invalid base+index expression +// X64: error: invalid base+index expression leaq (%eax,%esp), %eax -// 32: error: invalid 16-bit base/index register combination -// 64: error: invalid 16-bit base register +// X86: error: invalid 16-bit base/index register combination +// X64: error: invalid 16-bit base register lea (%si,%bp), %ax -// 32: error: invalid 16-bit base/index register combination -// 64: error: invalid 16-bit base register +// X86: error: invalid 16-bit base/index register combination +// X64: error: invalid 16-bit base register lea (%di,%bp), %ax -// 32: error: invalid 16-bit base/index register combination -// 64: error: invalid 16-bit base register +// X86: error: invalid 16-bit base/index register combination +// X64: error: invalid 16-bit base register lea (%si,%bx), %ax -// 32: error: invalid 16-bit base/index register combination -// 64: error: invalid 16-bit base register +// X86: error: invalid 16-bit base/index register combination +// X64: error: invalid 16-bit base register lea (%di,%bx), %ax -// 32: error: invalid base+index expression -// 64: error: invalid base+index expression +// X86: error: invalid base+index expression +// X64: error: invalid base+index expression mov (,%eip), %rbx -// 32: error: invalid base+index expression -// 64: error: invalid base+index expression +// X86: error: invalid base+index expression +// X64: error: invalid base+index expression mov (%eip,%eax), %rbx -// 32: error: register %rax is only available in 64-bit mode -// 64: error: base register is 64-bit, but index 
register is not +// X86: error: register %rax is only available in 64-bit mode +// X64: error: base register is 64-bit, but index register is not mov (%rax,%eiz), %ebx -// 32: error: register %riz is only available in 64-bit mode -// 64: error: base register is 32-bit, but index register is not +// X86: error: register %riz is only available in 64-bit mode +// X64: error: base register is 32-bit, but index register is not mov (%eax,%riz), %ebx @@ -128,68 +128,68 @@ v_gs = %gs v_imm = 4 $test = %ebx -// 32: 7: error: expected register here -// 64: 7: error: expected register here +// X86: 7: error: expected register here +// X64: 7: error: expected register here mov 4(4), %eax -// 32: 7: error: expected register here -// 64: 7: error: expected register here +// X86: 7: error: expected register here +// X64: 7: error: expected register here mov 5(v_imm), %eax -// 32: 7: error: invalid register name -// 64: 7: error: invalid register name +// X86: 7: error: invalid register name +// X64: 7: error: invalid register name mov 6(%v_imm), %eax -// 32: 8: warning: scale factor without index register is ignored -// 64: 8: warning: scale factor without index register is ignored +// X86: 8: warning: scale factor without index register is ignored +// X64: 8: warning: scale factor without index register is ignored mov 7(,v_imm), %eax -// 64: 6: error: expected immediate expression +// X64: 6: error: expected immediate expression mov $%eax, %ecx -// 32: 6: error: expected immediate expression -// 64: 6: error: expected immediate expression +// X86: 6: error: expected immediate expression +// X64: 6: error: expected immediate expression mov $v_eax, %ecx -// 32: error: unexpected token in argument list -// 64: error: unexpected token in argument list +// X86: error: unexpected token in argument list +// X64: error: unexpected token in argument list mov v_ecx(%eax), %ecx -// 32: 7: error: invalid operand for instruction -// 64: 7: error: invalid operand for instruction +// X86: 7: error: invalid operand for instruction +// X64: 7: error: invalid operand for instruction addb (%dx), %al -// 32: error: instruction requires: 64-bit mode +// X86: error: instruction requires: 64-bit mode cqto -// 32: error: instruction requires: 64-bit mode +// X86: error: instruction requires: 64-bit mode cltq -// 32: error: instruction requires: 64-bit mode +// X86: error: instruction requires: 64-bit mode cmpxchg16b (%eax) -// 32: error: unsupported instruction -// 64: error: unsupported instruction +// X86: error: unsupported instruction +// X64: error: unsupported instruction {vex} vmovdqu32 %xmm0, %xmm0 -// 32: error: unsupported instruction -// 64: error: unsupported instruction +// X86: error: unsupported instruction +// X64: error: unsupported instruction {vex2} vmovdqu32 %xmm0, %xmm0 -// 32: error: unsupported instruction -// 64: error: unsupported instruction +// X86: error: unsupported instruction +// X64: error: unsupported instruction {vex3} vmovdqu32 %xmm0, %xmm0 -// 32: error: unsupported instruction -// 64: error: unsupported instruction +// X86: error: unsupported instruction +// X64: error: unsupported instruction {evex} vmovdqu %xmm0, %xmm0 -// 32: 12: error: immediate must be an integer in range [0, 15] -// 64: 12: error: immediate must be an integer in range [0, 15] +// X86: 12: error: immediate must be an integer in range [0, 15] +// X64: 12: error: immediate must be an integer in range [0, 15] vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2 -// 32: error: instruction requires: 64-bit mode +// X86: error: 
instruction requires: 64-bit mode pbndkb -// 32: error: register %r16d is only available in 64-bit mode +// X86: error: register %r16d is only available in 64-bit mode movl %eax, %r16d diff --git a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll index fbc4a2c006f84e..fd3b7bd815300c 100644 --- a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll +++ b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll @@ -58,14 +58,13 @@ suspend.cond: ; CHECK-NEXT: to label %[[STEP2_CONT:[^ ]+]] unwind label %[[PAD]] step2: %save2 = call token @llvm.coro.save(ptr null) - %resume.handle = invoke ptr @llvm.coro.await.suspend.handle(ptr %awaiter, ptr %hdl, ptr @await_suspend_wrapper_handle) + invoke void @llvm.coro.await.suspend.handle(ptr %awaiter, ptr %hdl, ptr @await_suspend_wrapper_handle) to label %step2.continue unwind label %pad ; CHECK: [[STEP2_CONT]]: ; CHECK-NEXT: %[[NEXT_RESUME:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[NEXT_HDL]], i8 0) ; CHECK-NEXT: musttail call {{.*}} void %[[NEXT_RESUME]](ptr %[[NEXT_HDL]]) step2.continue: - call void @llvm.coro.resume(ptr %resume.handle) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %ret [ i8 0, label %step3 @@ -112,7 +111,7 @@ declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) declare void @llvm.coro.await.suspend.void(ptr, ptr, ptr) declare i1 @llvm.coro.await.suspend.bool(ptr, ptr, ptr) -declare ptr @llvm.coro.await.suspend.handle(ptr, ptr, ptr) +declare void @llvm.coro.await.suspend.handle(ptr, ptr, ptr) declare i1 @llvm.coro.end(ptr, i1, token) declare ptr @__cxa_begin_catch(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower.ll b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower.ll index 0f574c4acc26e7..8d019e6954628b 100644 --- a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower.ll +++ b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower.ll @@ -49,8 +49,7 @@ suspend.cond: ; CHECK-NEXT: musttail call {{.*}} void %[[CONT]](ptr %[[NEXT_HDL]]) step2: %save2 = call token @llvm.coro.save(ptr null) - %resume.handle = call ptr @llvm.coro.await.suspend.handle(ptr %awaiter, ptr %hdl, ptr @await_suspend_wrapper_handle) - call void @llvm.coro.resume(ptr %resume.handle) + call void @llvm.coro.await.suspend.handle(ptr %awaiter, ptr %hdl, ptr @await_suspend_wrapper_handle) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %ret [ i8 0, label %step3 @@ -89,7 +88,7 @@ declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) declare void @llvm.coro.await.suspend.void(ptr, ptr, ptr) declare i1 @llvm.coro.await.suspend.bool(ptr, ptr, ptr) -declare ptr @llvm.coro.await.suspend.handle(ptr, ptr, ptr) +declare void @llvm.coro.await.suspend.handle(ptr, ptr, ptr) declare i1 @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-preserve-final.ll b/llvm/test/Transforms/Coroutines/coro-preserve-final.ll deleted file mode 100644 index 16eeb84e7915ae..00000000000000 --- a/llvm/test/Transforms/Coroutines/coro-preserve-final.ll +++ /dev/null @@ -1,131 +0,0 @@ -; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s - -%"struct.std::__n4861::noop_coroutine_promise" = type { i8 } -%struct.Promise = type { %"struct.std::__n4861::coroutine_handle" } -%"struct.std::__n4861::coroutine_handle" = type { ptr } - -define 
dso_local ptr @_Z5Outerv() #1 { -entry: - %__promise = alloca %struct.Promise, align 8 - %0 = call token @llvm.coro.id(i32 16, ptr nonnull %__promise, ptr nonnull @_Z5Outerv, ptr null) - %1 = call i1 @llvm.coro.alloc(token %0) - br i1 %1, label %coro.alloc, label %init.suspend - -coro.alloc: ; preds = %entry - %2 = tail call i64 @llvm.coro.size.i64() - %call = call noalias noundef nonnull ptr @_Znwm(i64 noundef %2) #12 - br label %init.suspend - -init.suspend: ; preds = %entry, %coro.alloc - %3 = phi ptr [ null, %entry ], [ %call, %coro.alloc ] - %4 = call ptr @llvm.coro.begin(token %0, ptr %3) #13 - call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %__promise) #3 - store ptr null, ptr %__promise, align 8 - %5 = call token @llvm.coro.save(ptr null) - %6 = call i8 @llvm.coro.suspend(token %5, i1 false) - switch i8 %6, label %coro.ret [ - i8 0, label %await.suspend - i8 1, label %cleanup62 - ] - -await.suspend: ; preds = %init.suspend - %7 = call token @llvm.coro.save(ptr null) - %8 = call ptr @llvm.coro.subfn.addr(ptr %4, i8 0) - call fastcc void %8(ptr %4) #3 - %9 = call i8 @llvm.coro.suspend(token %7, i1 false) - switch i8 %9, label %coro.ret [ - i8 0, label %await2.suspend - i8 1, label %cleanup62 - ] - -await2.suspend: ; preds = %await.suspend - %call27 = call ptr @_Z5Innerv() #3 - %10 = call token @llvm.coro.save(ptr null) - %11 = getelementptr inbounds i8, ptr %__promise, i64 -16 - store ptr %11, ptr %call27, align 8 - %12 = getelementptr inbounds i8, ptr %call27, i64 -16 - %13 = call ptr @llvm.coro.subfn.addr(ptr nonnull %12, i8 0) - call fastcc void %13(ptr nonnull %12) #3 - %14 = call i8 @llvm.coro.suspend(token %10, i1 false) - switch i8 %14, label %coro.ret [ - i8 0, label %final.suspend - i8 1, label %cleanup62 - ] - -final.suspend: ; preds = %await2.suspend - %15 = call ptr @llvm.coro.subfn.addr(ptr nonnull %12, i8 1) - call fastcc void %15(ptr nonnull %12) #3 - %16 = call token @llvm.coro.save(ptr null) - %retval.sroa.0.0.copyload.i = load ptr, ptr %__promise, align 8 - %17 = call ptr @llvm.coro.subfn.addr(ptr %retval.sroa.0.0.copyload.i, i8 0) - call fastcc void %17(ptr %retval.sroa.0.0.copyload.i) #3 - %18 = call i8 @llvm.coro.suspend(token %16, i1 true) #13 - switch i8 %18, label %coro.ret [ - i8 0, label %final.ready - i8 1, label %cleanup62 - ] - -final.ready: ; preds = %final.suspend - call void @_Z5_exiti(i32 noundef 1) #14 - unreachable - -cleanup62: ; preds = %await2.suspend, %await.suspend, %init.suspend, %final.suspend - call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %__promise) #3 - %19 = call ptr @llvm.coro.free(token %0, ptr %4) - %.not = icmp eq ptr %19, null - br i1 %.not, label %coro.ret, label %coro.free - -coro.free: ; preds = %cleanup62 - call void @_ZdlPv(ptr noundef nonnull %19) #3 - br label %coro.ret - -coro.ret: ; preds = %coro.free, %cleanup62, %final.suspend, %await2.suspend, %await.suspend, %init.suspend - %20 = call i1 @llvm.coro.end(ptr null, i1 false, token none) #13 - ret ptr %__promise -} - -declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #2 -declare i1 @llvm.coro.alloc(token) #3 -declare dso_local noundef nonnull ptr @_Znwm(i64 noundef) local_unnamed_addr #4 -declare i64 @llvm.coro.size.i64() #5 -declare ptr @llvm.coro.begin(token, ptr writeonly) #3 -declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #6 -declare token @llvm.coro.save(ptr) #7 -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #6 -declare i8 @llvm.coro.suspend(token, i1) #3 -declare dso_local ptr @_Z5Innerv() 
local_unnamed_addr #8 -declare dso_local void @_ZdlPv(ptr noundef) local_unnamed_addr #9 -declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 -declare i1 @llvm.coro.end(ptr, i1, token) #3 -declare dso_local void @_Z5_exiti(i32 noundef) local_unnamed_addr #10 -declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #11 - -attributes #0 = { mustprogress nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #1 = { nounwind presplitcoroutine uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #2 = { argmemonly nofree nounwind readonly } -attributes #3 = { nounwind } -attributes #4 = { nobuiltin allocsize(0) "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #5 = { nofree nosync nounwind readnone } -attributes #6 = { argmemonly mustprogress nocallback nofree nosync nounwind willreturn } -attributes #7 = { nomerge nounwind } -attributes #8 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #9 = { nobuiltin nounwind "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #10 = { noreturn "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #11 = { argmemonly nounwind readonly } -attributes #12 = { nounwind allocsize(0) } -attributes #13 = { noduplicate } -attributes #14 = { noreturn nounwind } - -; CHECK: define{{.*}}@_Z5Outerv.resume( -; CHECK: entry.resume: -; CHECK: switch i2 %index -; CHECK-NEXT: i2 0, label %await2.suspend -; CHECK-NEXT: i2 1, label %final.suspend -; -; CHECK: await2.suspend: -; CHECK: musttail call -; CHECK-NEXT: ret void -; -; CHECK: final.suspend: -; CHECK: musttail call -; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail-chain-pgo-counter-promo.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail-chain-pgo-counter-promo.ll index ddd293eed2409e..e2ed205f2c2f4f 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail-chain-pgo-counter-promo.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail-chain-pgo-counter-promo.ll @@ -90,10 +90,7 @@ define ptr @f(i32 %0) presplitcoroutine align 32 { %25 = getelementptr inbounds { ptr, ptr }, ptr %5, i64 0, i32 1 store ptr %24, ptr %25, align 8 %26 = call token @llvm.coro.save(ptr null) - %27 = call ptr @await_transform_await_suspend(ptr noundef nonnull align 8 dereferenceable(16) %5, ptr %14) - %28 = call ptr @llvm.coro.subfn.addr(ptr %27, i8 0) - %29 = ptrtoint ptr %28 to i64 - call fastcc void %28(ptr %27) #9 + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_transform_await_suspend) %30 = call i8 @llvm.coro.suspend(token %26, i1 false) switch i8 %30, label %60 [ i8 0, label %31 @@ -123,9 +120,7 @@ define ptr @f(i32 %0) 
presplitcoroutine align 32 { br i1 %42, label %43, label %46 43: ; preds = %36 - %44 = call ptr @llvm.coro.subfn.addr(ptr nonnull %14, i8 1) - %45 = ptrtoint ptr %44 to i64 - call fastcc void %44(ptr nonnull %14) #9 + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_transform_await_suspend) br label %47 46: ; preds = %36 diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail.ll index 825e44471db27a..70f29f4a9a4dc4 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail.ll @@ -1,7 +1,6 @@ -; Tests that coro-split will convert coro.resume followed by a suspend to a -; musttail call. -; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,NOPGO %s -; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,PGO %s +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call. +; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK %s +; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK %s define void @f() #0 { entry: @@ -20,8 +19,7 @@ entry: ] await.ready: %save2 = call token @llvm.coro.save(ptr null) - %addr2 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) - call fastcc void %addr2(ptr null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %exit [ @@ -40,10 +38,8 @@ exit: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @f.resume( -; CHECK: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr2]](ptr null) -; PGO: call void @llvm.instrprof -; PGO-NEXT: musttail call fastcc void %[[addr2]](ptr null) +; CHECK: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr2]] ; CHECK-NEXT: ret void declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1 @@ -57,6 +53,7 @@ declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 declare i1 @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll index d0d11fc4495e48..3edb8728d8550b 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll @@ -1,7 +1,6 @@ -; Tests that coro-split will convert coro.resume followed by a suspend to a -; musttail call. -; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,NOPGO %s -; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,PGO %s +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call. 
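; Editorial sketch (not part of the patch) of the pattern this group of musttail
; tests now feeds to coro-split: the void-returning @llvm.coro.await.suspend.handle
; intrinsic sits between coro.save and coro.suspend, with @await_suspend_function
; (declared at the bottom of each test) as the wrapper argument:
;
;   %save = call token @llvm.coro.save(ptr null)
;   call void @llvm.coro.await.suspend.handle(ptr %awaiter, ptr %hdl, ptr @await_suspend_function)
;   %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
;   switch i8 %suspend, label %exit [ i8 0, label %await.ready
;                                     i8 1, label %cleanup ]
;
; In the split-off resume function the CHECK lines expect this to become a plain call
; of the wrapper followed by a tail resume of the handle it returns:
;
;   %handle = call ptr @await_suspend_function(ptr %awaiter, ptr %hdl)
;   %resume.fn = call ptr @llvm.coro.subfn.addr(ptr %handle, i8 0)
;   musttail call fastcc void %resume.fn(ptr %handle)
;   ret void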
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK %s +; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK %s define void @f() #0 { entry: @@ -28,17 +27,14 @@ await.suspend: ] await.resume1: %hdl = call ptr @g() - %addr2 = call ptr @llvm.coro.subfn.addr(ptr %hdl, i8 0) - call fastcc void %addr2(ptr %hdl) + call void @llvm.coro.await.suspend.handle(ptr null, ptr %hdl, ptr @await_suspend_function) br label %final.suspend await.resume2: %hdl2 = call ptr @h() - %addr3 = call ptr @llvm.coro.subfn.addr(ptr %hdl2, i8 0) - call fastcc void %addr3(ptr %hdl2) + call void @llvm.coro.await.suspend.handle(ptr null, ptr %hdl2, ptr @await_suspend_function) br label %final.suspend await.resume3: - %addr4 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) - call fastcc void %addr4(ptr null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) br label %final.suspend final.suspend: %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) @@ -63,18 +59,18 @@ unreach: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @f.resume( ; CHECK: %[[hdl:.+]] = call ptr @g() -; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl]], i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr2]](ptr %[[hdl]]) -; PGO: musttail call fastcc void %[[addr2]](ptr %[[hdl]]) +; CHECK-NEXT: call ptr @await_suspend_function +; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr2]] ; CHECK-NEXT: ret void ; CHECK: %[[hdl2:.+]] = call ptr @h() -; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl2]], i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr3]](ptr %[[hdl2]]) -; PGO: musttail call fastcc void %[[addr3]](ptr %[[hdl2]]) +; CHECK-NEXT: call ptr @await_suspend_function +; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr3]] ; CHECK-NEXT: ret void -; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr4]](ptr null) -; PGO: musttail call fastcc void %[[addr4]](ptr null) +; CHECK: call ptr @await_suspend_function +; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr4]] ; CHECK-NEXT: ret void @@ -93,6 +89,7 @@ declare ptr @malloc(i64) declare i8 @switch_result() declare ptr @g() declare ptr @h() +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll index 3e91b79c10f736..a55b3d16e2ded3 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll @@ -1,4 +1,4 @@ -; Tests that we would convert coro.resume to a musttail call if the target is +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call if the target is ; Wasm64 or Wasm32 with tail-call support. 
; REQUIRES: webassembly-registered-target @@ -25,8 +25,7 @@ entry: ] await.ready: %save2 = call token @llvm.coro.save(ptr null) - %addr2 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) - call fastcc void %addr2(ptr null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %exit [ @@ -51,6 +50,7 @@ declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 declare i1 @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine "target-features"="+tail-call" } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll index 2f27f79480ab1b..ca1611e19b9f94 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll @@ -1,5 +1,4 @@ -; Tests that coro-split will convert coro.resume followed by a suspend to a -; musttail call. +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call. ; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s ; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s @@ -8,11 +7,6 @@ entry: ret void; } -define void @fakeresume2(ptr align 8) { -entry: - ret void; -} - define void @g() #0 { entry: %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) @@ -29,7 +23,7 @@ entry: ] await.ready: %save2 = call token @llvm.coro.save(ptr null) - call fastcc void @fakeresume2(ptr align 8 null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %exit [ @@ -47,7 +41,9 @@ exit: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @g.resume( -; CHECK: musttail call fastcc void @fakeresume2(ptr align 8 null) +; CHECK: call ptr @await_suspend_function +; CHECK-NEXT: call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void ; CHECK-NEXT: ret void declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1 @@ -61,6 +57,7 @@ declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 declare i1 @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll index 4778e3dcaf9957..84cdac17beebb1 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll @@ -1,7 +1,6 @@ -; Tests that coro-split will convert coro.resume followed by a suspend to a -; musttail call. -; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,NOPGO %s -; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,PGO %s +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call. 
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK %s +; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK %s define void @f() #0 { entry: @@ -26,17 +25,14 @@ await.suspend: ] await.resume1: %hdl = call ptr @g() - %addr2 = call ptr @llvm.coro.subfn.addr(ptr %hdl, i8 0) - call fastcc void %addr2(ptr %hdl) + call void @llvm.coro.await.suspend.handle(ptr null, ptr %hdl, ptr @await_suspend_function) br label %final.suspend await.resume2: %hdl2 = call ptr @h() - %addr3 = call ptr @llvm.coro.subfn.addr(ptr %hdl2, i8 0) - call fastcc void %addr3(ptr %hdl2) + call void @llvm.coro.await.suspend.handle(ptr null, ptr %hdl2, ptr @await_suspend_function) br label %final.suspend await.resume3: - %addr4 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) - call fastcc void %addr4(ptr null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) br label %final.suspend final.suspend: %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) @@ -59,22 +55,21 @@ unreach: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @f.resume( ; CHECK: %[[hdl:.+]] = call ptr @g() -; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl]], i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr2]](ptr %[[hdl]]) -; PGO: musttail call fastcc void %[[addr2]](ptr %[[hdl]]) +; CHECK-NEXT: call ptr @await_suspend_function +; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr2]] ; CHECK-NEXT: ret void ; CHECK: %[[hdl2:.+]] = call ptr @h() -; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl2]], i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr3]](ptr %[[hdl2]]) -; PGO: musttail call fastcc void %[[addr3]](ptr %[[hdl2]]) +; CHECK-NEXT: call ptr @await_suspend_function +; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr3]] ; CHECK-NEXT: ret void -; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr4]](ptr null) -; PGO: musttail call fastcc void %[[addr4]](ptr null) +; CHECK: call ptr @await_suspend_function +; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr4]] ; CHECK-NEXT: ret void - declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1 declare i1 @llvm.coro.alloc(token) #2 declare i64 @llvm.coro.size.i64() #3 @@ -89,6 +84,7 @@ declare ptr @malloc(i64) declare i8 @switch_result() declare ptr @g() declare ptr @h() +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll index 00ee422ce5863d..b647bd2e4a207f 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll @@ -1,5 +1,4 @@ -; Tests that coro-split will convert a call before coro.suspend to a musttail call -; while the user of the coro.suspend is a icmpinst. +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call. 
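; Editorial note (not part of the patch): unlike the switch-based tests above, this
; test consumes the suspend with an icmp/br pair, so the musttail conversion also has
; to trigger when the coro.suspend result feeds a comparison rather than a switch:
;
;   call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function)
;   %suspend = call i8 @llvm.coro.suspend(token %save2, i1 true)
;   %switch = icmp ult i8 %suspend, 2
;   br i1 %switch, label %cleanup, label %coro.end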
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s ; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s @@ -24,7 +23,7 @@ entry: await.ready: %save2 = call token @llvm.coro.save(ptr null) - call fastcc void @fakeresume1(ptr align 8 null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend = call i8 @llvm.coro.suspend(token %save2, i1 true) %switch = icmp ult i8 %suspend, 2 br i1 %switch, label %cleanup, label %coro.end @@ -44,7 +43,7 @@ coro.end: } ; CHECK-LABEL: @f.resume( -; CHECK: musttail call fastcc void @fakeresume1( +; CHECK: musttail call fastcc void ; CHECK-NEXT: ret void declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1 @@ -59,6 +58,7 @@ declare i1 @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare void @delete(ptr nonnull) #2 +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll index 9afc79abbe88cd..7c1a13fd83cec5 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll @@ -1,5 +1,4 @@ -; Tests that sinked lifetime markers wouldn't provent optimization -; to convert a resuming call to a musttail call. +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call. ; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s ; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s @@ -22,7 +21,7 @@ entry: ] await.suspend: %save2 = call token @llvm.coro.save(ptr null) - call fastcc void @fakeresume1(ptr align 8 null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %exit [ i8 0, label %await.ready @@ -39,7 +38,7 @@ exit: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @g.resume( -; CHECK: musttail call fastcc void @fakeresume1(ptr align 8 null) +; CHECK: musttail call fastcc void ; CHECK-NEXT: ret void declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1 @@ -56,6 +55,7 @@ declare ptr @malloc(i64) declare void @consume(ptr) declare void @llvm.lifetime.start.p0(i64, ptr nocapture) declare void @llvm.lifetime.end.p0(i64, ptr nocapture) +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll index d9dba92ec4eb7e..e05169a7291680 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll @@ -1,5 +1,5 @@ ; Tests that sinked lifetime markers wouldn't provent optimization -; to convert a resuming call to a musttail call. +; to convert a coro.await.suspend.handle call to a musttail call. ; The difference between this and coro-split-musttail5.ll is that there is ; an extra bitcast instruction in the path, which makes it harder to ; optimize. 
@@ -25,7 +25,7 @@ entry: ] await.suspend: %save2 = call token @llvm.coro.save(ptr null) - call fastcc void @fakeresume1(ptr align 8 null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %exit [ i8 0, label %await.ready @@ -42,7 +42,7 @@ exit: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @g.resume( -; CHECK: musttail call fastcc void @fakeresume1(ptr align 8 null) +; CHECK: musttail call fastcc void ; CHECK-NEXT: ret void ; It has a cleanup bb. @@ -63,7 +63,7 @@ entry: ] await.suspend: %save2 = call token @llvm.coro.save(ptr null) - call fastcc void @fakeresume1(ptr align 8 null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %exit [ i8 0, label %await.ready @@ -90,7 +90,7 @@ exit: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @f.resume( -; CHECK: musttail call fastcc void @fakeresume1(ptr align 8 null) +; CHECK: musttail call fastcc void ; CHECK-NEXT: ret void declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1 @@ -108,6 +108,7 @@ declare void @delete(ptr nonnull) #2 declare void @consume(ptr) declare void @llvm.lifetime.start.p0(i64, ptr nocapture) declare void @llvm.lifetime.end.p0(i64, ptr nocapture) +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll index d0d5005587bda6..8ceb0dda94f6ac 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll @@ -1,13 +1,11 @@ ; Tests that sinked lifetime markers wouldn't provent optimization -; to convert a resuming call to a musttail call. +; to convert a coro.await.suspend.handle call to a musttail call. ; The difference between this and coro-split-musttail5.ll and coro-split-musttail6.ll ; is that this contains dead instruction generated during the transformation, ; which makes the optimization harder. ; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s ; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s -declare void @fakeresume1(ptr align 8) - define i64 @g() #0 { entry: %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) @@ -25,7 +23,7 @@ entry: ] await.suspend: %save2 = call token @llvm.coro.save(ptr null) - call fastcc void @fakeresume1(ptr align 8 null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) ; These (non-trivially) dead instructions are in the way. @@ -48,7 +46,9 @@ exit: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @g.resume( -; CHECK: musttail call fastcc void @fakeresume1(ptr align 8 null) +; CHECK: %[[FRAME:[0-9]+]] = call ptr @await_suspend_function(ptr null, ptr null) +; CHECK: %[[RESUMEADDR:[0-9]+]] = call ptr @llvm.coro.subfn.addr(ptr %[[FRAME]], i8 0) +; CHECK: musttail call fastcc void %[[RESUMEADDR]](ptr %[[FRAME]]) ; CHECK-NEXT: ret void ; It has a cleanup bb. 
@@ -69,7 +69,7 @@ entry: ] await.suspend: %save2 = call token @llvm.coro.save(ptr null) - call fastcc void @fakeresume1(ptr align 8 null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %exit [ i8 0, label %await.ready @@ -96,7 +96,9 @@ exit: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @f.resume( -; CHECK: musttail call fastcc void @fakeresume1(ptr align 8 null) +; CHECK: %[[FRAME:[0-9]+]] = call ptr @await_suspend_function(ptr null, ptr null) +; CHECK: %[[RESUMEADDR:[0-9]+]] = call ptr @llvm.coro.subfn.addr(ptr %[[FRAME]], i8 0) +; CHECK: musttail call fastcc void %[[RESUMEADDR]](ptr %[[FRAME]]) ; CHECK-NEXT: ret void declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1 @@ -114,6 +116,7 @@ declare void @delete(ptr nonnull) #2 declare void @consume(ptr) declare void @llvm.lifetime.start.p0(i64, ptr nocapture) declare void @llvm.lifetime.end.p0(i64, ptr nocapture) +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/InstCombine/phi-extractvalue.ll b/llvm/test/Transforms/InstCombine/phi-extractvalue.ll index 75fd4718721cc4..26893c178f590f 100644 --- a/llvm/test/Transforms/InstCombine/phi-extractvalue.ll +++ b/llvm/test/Transforms/InstCombine/phi-extractvalue.ll @@ -131,7 +131,7 @@ end: ret i32 %r } -; But the indicies must match +; But the indices must match define i32 @test4({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: entry: @@ -162,7 +162,7 @@ end: ret i32 %r } -; More complex aggregates are fine, too, as long as indicies match. +; More complex aggregates are fine, too, as long as indices match. define i32 @test5({{ i32, i32 }, { i32, i32 }} %agg_left, {{ i32, i32 }, { i32, i32 }} %agg_right, i1 %c) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: entry: @@ -192,7 +192,7 @@ end: ret i32 %r } -; The indicies must fully match, on all levels. +; The indices must fully match, on all levels. define i32 @test6({{ i32, i32 }, { i32, i32 }} %agg_left, {{ i32, i32 }, { i32, i32 }} %agg_right, i1 %c) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: entry: @@ -282,7 +282,7 @@ end: } ; Also, unlike PHI-of-insertvalues, here the base aggregates of extractvalue -; can have different types, and just checking the indicies is not enough. +; can have different types, and just checking the indices is not enough. define i32 @test9({ i32, i32 } %agg_left, { i32, { i32, i32 } } %agg_right, i1 %c) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/InstCombine/phi-of-insertvalues.ll b/llvm/test/Transforms/InstCombine/phi-of-insertvalues.ll index 548fb3bc9ddb01..3596ef198303e1 100644 --- a/llvm/test/Transforms/InstCombine/phi-of-insertvalues.ll +++ b/llvm/test/Transforms/InstCombine/phi-of-insertvalues.ll @@ -192,7 +192,7 @@ end: ret { i32, i32 } %r } -; But the indicies must match +; But the indices must match define { i32, i32 } @test6({ i32, i32 } %agg, i32 %val_left, i32 %val_right, i1 %c) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: entry: @@ -223,7 +223,7 @@ end: ret { i32, i32 } %r } -; More complex aggregates are fine, too, as long as indicies match. +; More complex aggregates are fine, too, as long as indices match. 
define {{ i32, i32 }, { i32, i32 }} @test7({{ i32, i32 }, { i32, i32 }} %agg, i32 %val_left, i32 %val_right, i1 %c) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: entry: @@ -253,7 +253,7 @@ end: ret {{ i32, i32 }, { i32, i32 }} %r } -; The indicies must fully match, on all levels. +; The indices must fully match, on all levels. define {{ i32, i32 }, { i32, i32 }} @test8({{ i32, i32 }, { i32, i32 }} %agg, i32 %val_left, i32 %val_right, i1 %c) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/LoopVectorize/as_cast.ll b/llvm/test/Transforms/LoopVectorize/as_cast.ll new file mode 100644 index 00000000000000..58a8c3d078f02a --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/as_cast.ll @@ -0,0 +1,40 @@ +; RUN: opt -passes=loop-vectorize %s -force-vector-width=1 -force-vector-interleave=2 -S -o - | FileCheck %s + +define void @foo(ptr addrspace(1) %in) { +entry: + br label %loop + +loop: + %iter = phi i64 [ %next, %loop ], [ 0, %entry ] + %ascast = addrspacecast ptr addrspace(1) %in to ptr + %next = add i64 %iter, 1 + %arrayidx = getelementptr inbounds i64, ptr %ascast, i64 %next + store i64 %next, ptr %arrayidx, align 4 + +; check that we find the two interleaved blocks with ascast, gep and store: +; CHECK: pred.store.if: +; CHECK: [[ID1:%.*]] = add i64 %{{.*}}, 1 +; CHECK: [[AS1:%.*]] = addrspacecast ptr addrspace(1) %{{.*}} to ptr +; CHECK: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[AS1]], i64 [[ID1]] +; CHECK: store i64 [[ID1]], ptr [[GEP1]] + +; CHECK: pred.store.if1: +; CHECK: [[ID2:%.*]] = add i64 %{{.*}}, 1 +; CHECK: [[AS2:%.*]] = addrspacecast ptr addrspace(1) %in to ptr +; CHECK: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[AS2]], i64 [[ID2]] +; CHECK: store i64 [[ID2]], ptr %9, align 4 + + %cmp = icmp eq i64 %next, 7 + br i1 %cmp, label %exit, label %loop + +; check that we branch to the exit block +; CHECK: middle.block: +; CHECK: br i1 true, label %exit, label %scalar.ph + +exit: + ret void +; CHECK: exit: +; CHECK: ret void +} + +; CHECK: !{{[0-9]*}} = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-vector-gep.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-vector-gep.ll index e227e9911bc345..ccdc007f674f4a 100644 --- a/llvm/test/Transforms/VectorCombine/X86/scalarize-vector-gep.ll +++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-vector-gep.ll @@ -85,8 +85,8 @@ define void @both_operands_need_extraction.4elts(<4 x ptr> %baseptrs, <4 x i64> ;------------------------------------------------------------------------------- -define void @indicies_need_extraction.2elts(ptr %baseptr, <2 x i64> %indices) { -; CHECK-LABEL: @indicies_need_extraction.2elts( +define void @indices_need_extraction.2elts(ptr %baseptr, <2 x i64> %indices) { +; CHECK-LABEL: @indices_need_extraction.2elts( ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, ptr [[BASEPTR:%.*]], <2 x i64> [[INDICES:%.*]] ; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <2 x ptr> [[PTRS]], i64 0 ; CHECK-NEXT: call void @use(ptr [[PTR_0]]) @@ -105,8 +105,8 @@ define void @indicies_need_extraction.2elts(ptr %baseptr, <2 x i64> %indices) { ret void } -define void @indicies_need_extraction.3elts(ptr %baseptr, <3 x i64> %indices) { -; CHECK-LABEL: @indicies_need_extraction.3elts( +define void @indices_need_extraction.3elts(ptr %baseptr, <3 x i64> %indices) { +; CHECK-LABEL: @indices_need_extraction.3elts( ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, ptr [[BASEPTR:%.*]], <3 x i64> [[INDICES:%.*]] ; CHECK-NEXT: [[PTR_0:%.*]] = 
extractelement <3 x ptr> [[PTRS]], i64 0 ; CHECK-NEXT: call void @use(ptr [[PTR_0]]) @@ -130,8 +130,8 @@ define void @indicies_need_extraction.3elts(ptr %baseptr, <3 x i64> %indices) { ret void } -define void @indicies_need_extraction.4elts(ptr %baseptr, <4 x i64> %indices) { -; CHECK-LABEL: @indicies_need_extraction.4elts( +define void @indices_need_extraction.4elts(ptr %baseptr, <4 x i64> %indices) { +; CHECK-LABEL: @indices_need_extraction.4elts( ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, ptr [[BASEPTR:%.*]], <4 x i64> [[INDICES:%.*]] ; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <4 x ptr> [[PTRS]], i64 0 ; CHECK-NEXT: call void @use(ptr [[PTR_0]]) diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp index 807071a7b9a16a..506e4f22ef8f54 100644 --- a/llvm/tools/llvm-mc/llvm-mc.cpp +++ b/llvm/tools/llvm-mc/llvm-mc.cpp @@ -569,9 +569,6 @@ int main(int argc, char **argv) { Str->initSections(true, *STI); } - // Use Assembler information for parsing. - Str->setUseAssemblerInfoForParsing(true); - int Res = 1; bool disassemble = false; switch (Action) { diff --git a/llvm/tools/llvm-ml/llvm-ml.cpp b/llvm/tools/llvm-ml/llvm-ml.cpp index 1cac576f54e77f..f1f39af059aa49 100644 --- a/llvm/tools/llvm-ml/llvm-ml.cpp +++ b/llvm/tools/llvm-ml/llvm-ml.cpp @@ -428,9 +428,6 @@ int llvm_ml_main(int Argc, char **Argv, const llvm::ToolContext &) { Str->emitAssignment(Feat00Sym, MCConstantExpr::create(Feat00Flags, Ctx)); } - // Use Assembler information for parsing. - Str->setUseAssemblerInfoForParsing(true); - int Res = 1; if (InputArgs.hasArg(OPT_as_lex)) { // -as-lex; Lex only, and output a stream of tokens diff --git a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp index df48e9cc0ff4a4..8157e41e833a9c 100644 --- a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp +++ b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp @@ -288,6 +288,14 @@ std::string stringify(const omp::DirectiveWithClauses &DWC) { // --- Tests ---------------------------------------------------------- +namespace red { +// Make it easier to construct reduction operators from built-in intrinsics. 
+omp::clause::ReductionOperator
+makeOp(omp::clause::DefinedOperator::IntrinsicOperator Op) {
+  return omp::clause::ReductionOperator{omp::clause::DefinedOperator{Op}};
+}
+} // namespace red
+
 namespace {
 using namespace llvm::omp;
 
@@ -699,6 +707,92 @@ TEST_F(OpenMPDecompositionTest, Order1) {
 TEST_F(OpenMPDecompositionTest, Allocate1) {
   omp::Object x{"x"};
 
+  // Allocate + firstprivate
+  omp::List<omp::Clause> Clauses{
+      {OMPC_allocate,
+       omp::clause::Allocate{{std::nullopt, std::nullopt, std::nullopt, {x}}}},
+      {OMPC_firstprivate, omp::clause::Firstprivate{{x}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_sections,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "parallel shared(x)"); // (33)
+  ASSERT_EQ(Dir1, "sections firstprivate(x) allocate(, , , (x))"); // (33)
+}
+
+TEST_F(OpenMPDecompositionTest, Allocate2) {
+  omp::Object x{"x"};
+  auto Add = red::makeOp(omp::clause::DefinedOperator::IntrinsicOperator::Add);
+
+  // Allocate + in_reduction
+  omp::List<omp::Clause> Clauses{
+      {OMPC_allocate,
+       omp::clause::Allocate{{std::nullopt, std::nullopt, std::nullopt, {x}}}},
+      {OMPC_in_reduction, omp::clause::InReduction{{{Add}, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_target_parallel,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "target in_reduction((3), (x)) allocate(, , , (x))"); // (33)
+  ASSERT_EQ(Dir1, "parallel"); // (33)
+}
+
+TEST_F(OpenMPDecompositionTest, Allocate3) {
+  omp::Object x{"x"};
+
+  // Allocate + linear
+  omp::List<omp::Clause> Clauses{
+      {OMPC_allocate,
+       omp::clause::Allocate{{std::nullopt, std::nullopt, std::nullopt, {x}}}},
+      {OMPC_linear,
+       omp::clause::Linear{{std::nullopt, std::nullopt, std::nullopt, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_for,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  // The "shared" clause is duplicated---this isn't harmful, but it
+  // should be fixed eventually.
+  ASSERT_EQ(Dir0, "parallel shared(x) shared(x)"); // (33)
+  ASSERT_EQ(Dir1, "for linear(, , , (x)) firstprivate(x) lastprivate(, (x)) "
+                  "allocate(, , , (x))"); // (33)
+}
+
+TEST_F(OpenMPDecompositionTest, Allocate4) {
+  omp::Object x{"x"};
+
+  // Allocate + lastprivate
+  omp::List<omp::Clause> Clauses{
+      {OMPC_allocate,
+       omp::clause::Allocate{{std::nullopt, std::nullopt, std::nullopt, {x}}}},
+      {OMPC_lastprivate, omp::clause::Lastprivate{{std::nullopt, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_sections,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "parallel shared(x)"); // (33)
+  ASSERT_EQ(Dir1, "sections lastprivate(, (x)) allocate(, , , (x))"); // (33)
+}
+
+TEST_F(OpenMPDecompositionTest, Allocate5) {
+  omp::Object x{"x"};
+
+  // Allocate + private
   omp::List<omp::Clause> Clauses{
       {OMPC_allocate,
        omp::clause::Allocate{{std::nullopt, std::nullopt, std::nullopt, {x}}}},
@@ -715,6 +809,27 @@ TEST_F(OpenMPDecompositionTest, Allocate1) {
   ASSERT_EQ(Dir1, "sections private(x) allocate(, , , (x))"); // (33)
 }
 
+TEST_F(OpenMPDecompositionTest, Allocate6) {
+  omp::Object x{"x"};
+  auto Add = red::makeOp(omp::clause::DefinedOperator::IntrinsicOperator::Add);
+
+  // Allocate + reduction
+  omp::List<omp::Clause> Clauses{
+      {OMPC_allocate,
+       omp::clause::Allocate{{std::nullopt, std::nullopt, std::nullopt, {x}}}},
+      {OMPC_reduction, omp::clause::Reduction{{std::nullopt, {Add}, {x}}}},
+  };
+
+  omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_sections,
+                                  Clauses);
+  ASSERT_EQ(Dec.output.size(), 2u);
+
+  std::string Dir0 = stringify(Dec.output[0]);
+  std::string Dir1 = stringify(Dec.output[1]);
+  ASSERT_EQ(Dir0, "parallel shared(x)"); // (33)
+  ASSERT_EQ(Dir1, "sections reduction(, (3), (x)) allocate(, , , (x))"); // (33)
+}
+
 // REDUCTION
 // [5.2:134:17-18]
 // Directives: do, for, loop, parallel, scope, sections, simd, taskloop, teams
@@ -741,14 +856,6 @@ TEST_F(OpenMPDecompositionTest, Allocate1) {
 // clause on the construct, then the effect is as if the list item in the
 // reduction clause appears as a list item in a map clause with a map-type of
 // tofrom.
-namespace red {
-// Make is easier to construct reduction operators from built-in intrinsics.
-omp::clause::ReductionOperator
-makeOp(omp::clause::DefinedOperator::IntrinsicOperator Op) {
-  return omp::clause::ReductionOperator{omp::clause::DefinedOperator{Op}};
-}
-} // namespace red
-
 TEST_F(OpenMPDecompositionTest, Reduction1) {
   omp::Object x{"x"};
   auto Add = red::makeOp(omp::clause::DefinedOperator::IntrinsicOperator::Add);
diff --git a/llvm/utils/filecheck_lint/filecheck_lint.py b/llvm/utils/filecheck_lint/filecheck_lint.py
old mode 100644
new mode 100755
index dc054ab76a098e..837846db833215
--- a/llvm/utils/filecheck_lint/filecheck_lint.py
+++ b/llvm/utils/filecheck_lint/filecheck_lint.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # ===----------------------------------------------------------------------===##
 #
 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn
index 9dbfe0f94c1db3..c6fa142b376645 100644
--- a/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn
@@ -17,6 +17,8 @@ static_library("ProfileData") {
     "ItaniumManglingCanonicalizer.cpp",
     "MemProf.cpp",
     "MemProfReader.cpp",
+    "PGOCtxProfReader.cpp",
+    "PGOCtxProfWriter.cpp",
     "ProfileSummaryBuilder.cpp",
     "SampleProf.cpp",
     "SampleProfReader.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/ProfileData/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ProfileData/BUILD.gn
index 4919a808920976..f45542519173ea 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/ProfileData/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/ProfileData/BUILD.gn
@@ -14,6 +14,7 @@ unittest("ProfileDataTests") {
     "InstrProfTest.cpp",
     "ItaniumManglingCanonicalizerTest.cpp",
     "MemProfTest.cpp",
+    "PGOCtxProfReaderWriterTest.cpp",
     "SampleProfTest.cpp",
     "SymbolRemappingReaderTest.cpp",
   ]
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
index b07addc77b56cc..a4f19981eec38b 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
@@ -363,7 +363,6 @@ LogicalResult SerializeToHsacoPass::assembleIsa(const std::string &isa,
       mab->createObjectWriter(os), std::unique_ptr<MCCodeEmitter>(ce), *sti,
       mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible,
       /*DWARFMustBeAtTheEnd*/ false));
-  mcStreamer->setUseAssemblerInfoForParsing(true);
 
   std::unique_ptr<MCAsmParser> parser(
       createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 7b4507c52e02c7..511835a226e7a2 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -3385,13 +3385,13 @@ struct Conv1DGenerator
       auto rhsSize = cast(rhs.getType()).getShape()[0];
       auto resSize = cast(res.getType()).getShape()[1];
 
-      SmallVector<int64_t> indicies;
+      SmallVector<int64_t> indices;
       for (int i = 0; i < resSize / rhsSize; ++i) {
         for (int j = 0; j < rhsSize; ++j)
-          indicies.push_back(j);
+          indices.push_back(j);
       }
 
-      rhs = rewriter.create<vector::ShuffleOp>(loc, rhs, rhs, indicies);
+      rhs = rewriter.create<vector::ShuffleOp>(loc, rhs, rhs, indices);
     }
     // Broadcast the filter to match the output vector
     rhs = rewriter.create<vector::BroadcastOp>(
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
index eb09f007fbca88..247759e21efb16 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
@@ -648,13 +648,14 @@ LogicalResult transform::ApplyConversionPatternsOp::verify() {
   if (!llvm::hasSingleElement(typeConverterRegion.front()))
     return emitOpError()
            << "expected exactly one op in default type converter region";
+  Operation *maybeTypeConverter = &typeConverterRegion.front().front();
   auto typeConverterOp = dyn_cast<transform::TypeConverterBuilderOpInterface>(
-      &typeConverterRegion.front().front());
+      maybeTypeConverter);
   if (!typeConverterOp) {
     InFlightDiagnostic diag = emitOpError()
                               << "expected default converter child op to "
                                  "implement TypeConverterBuilderOpInterface";
-    diag.attachNote(typeConverterOp->getLoc()) << "op without interface";
+    diag.attachNote(maybeTypeConverter->getLoc()) << "op without interface";
     return diag;
   }
   // Check default type converter type.
diff --git a/mlir/lib/Target/LLVM/ROCDL/Target.cpp b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
index 66593fd8a55fa6..cc13e5b7436ea7 100644
--- a/mlir/lib/Target/LLVM/ROCDL/Target.cpp
+++ b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
@@ -299,7 +299,6 @@ SerializeGPUModuleBase::assembleIsa(StringRef isa) {
       mab->createObjectWriter(os), std::unique_ptr<MCCodeEmitter>(ce), *sti,
      mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible,
       /*DWARFMustBeAtTheEnd*/ false));
-  mcStreamer->setUseAssemblerInfoForParsing(true);
 
   std::unique_ptr<MCAsmParser> parser(
       createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index f599f0ef4c4904..9cffa22c8ba702 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -17,26 +17,16 @@ if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
   project(offload C CXX ASM)
 endif()
 
-set(ENABLE_LIBOMPTARGET ON)
-# Currently libomptarget cannot be compiled on Windows or MacOS X.
-# Since the device plugins are only supported on Linux anyway,
-# there is no point in trying to compile libomptarget on other OSes.
-# 32-bit systems are not supported either.
-if (APPLE OR WIN32 OR NOT "cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES OR NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
-  set(ENABLE_LIBOMPTARGET OFF)
-endif()
-
-option(OPENMP_ENABLE_LIBOMPTARGET "Enable building libomptarget for offloading."
-       ${ENABLE_LIBOMPTARGET})
-if (OPENMP_ENABLE_LIBOMPTARGET)
-  # Check that the library can actually be built.
-  if (APPLE OR WIN32)
-    message(FATAL_ERROR "libomptarget cannot be built on Windows and MacOS X!")
-  elseif (NOT "cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES)
-    message(FATAL_ERROR "Host compiler must support C++17 to build libomptarget!")
-  elseif (NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
-    message(FATAL_ERROR "libomptarget on 32-bit systems are not supported!")
-  endif()
+# Check that the library can actually be built.
+if(APPLE OR WIN32 OR WASM)
+  message(WARNING "libomptarget cannot be built on Windows and MacOS X!")
+  return()
+elseif(NOT "cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES)
+  message(WARNING "Host compiler must support C++17 to build libomptarget!")
+  return()
+elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
+  message(WARNING "libomptarget on 32-bit systems is not supported!")
+  return()
 endif()
 
 if(OPENMP_STANDALONE_BUILD)
diff --git a/offload/test/mapping/map_both_pointer_pointee.c b/offload/test/mapping/map_both_pointer_pointee.c
new file mode 100644
index 00000000000000..4b724823e7a40f
--- /dev/null
+++ b/offload/test/mapping/map_both_pointer_pointee.c
@@ -0,0 +1,42 @@
+// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
+
+// REQUIRES: unified_shared_memory
+// UNSUPPORTED: amdgcn-amd-amdhsa
+
+#pragma omp declare target
+int *ptr1;
+#pragma omp end declare target
+
+#include <stdio.h>
+#include <stdlib.h>
+int main() {
+  ptr1 = (int *)malloc(sizeof(int) * 100);
+  int *ptr2;
+  ptr2 = (int *)malloc(sizeof(int) * 100);
+#pragma omp target map(ptr1, ptr1[ : 100])
+  { ptr1[1] = 6; }
+  // CHECK: 6
+  printf(" %d \n", ptr1[1]);
+#pragma omp target data map(ptr1[ : 5])
+  {
+#pragma omp target map(ptr1[2], ptr1, ptr1[3]) map(ptr2, ptr2[2])
+    {
+      ptr1[2] = 7;
+      ptr1[3] = 9;
+      ptr2[2] = 7;
+    }
+  }
+  // CHECK: 7 7 9
+  printf(" %d %d %d \n", ptr2[2], ptr1[2], ptr1[3]);
+  free(ptr1);
+#pragma omp target map(ptr2, ptr2[ : 100])
+  { ptr2[1] = 6; }
+  // CHECK: 6
+  printf(" %d \n", ptr2[1]);
+  free(ptr2);
+  return 0;
+}