diff --git a/deps/llvm.mk b/deps/llvm.mk
index e3c9b35626e18..37e3b1f62b52a 100644
--- a/deps/llvm.mk
+++ b/deps/llvm.mk
@@ -78,7 +78,11 @@ LLVM_CFLAGS += $(CFLAGS)
 LLVM_CXXFLAGS += $(CXXFLAGS)
 LLVM_CPPFLAGS += $(CPPFLAGS)
 LLVM_LDFLAGS += $(LDFLAGS)
-LLVM_TARGETS := host
+ifeq ($(LLVM_USE_CMAKE),1)
+LLVM_TARGETS := host;NVPTX
+else
+LLVM_TARGETS := host,nvptx
+endif
 LLVM_TARGET_FLAGS := --enable-targets=$(LLVM_TARGETS)
 LLVM_CMAKE += -DLLVM_TARGETS_TO_BUILD:STRING="$(LLVM_TARGETS)" -DCMAKE_BUILD_TYPE="$(LLVM_CMAKE_BUILDTYPE)"
 LLVM_CMAKE += -DLLVM_TOOLS_INSTALL_DIR=$(shell $(JULIAHOME)/contrib/relative_path.sh $(build_prefix) $(build_depsbindir))
@@ -479,6 +483,10 @@ $(eval $(call LLVM_PATCH,llvm-PR22923)) # Remove for 4.0
 $(eval $(call LLVM_PATCH,llvm-r282182)) # Remove for 4.0
 $(eval $(call LLVM_PATCH,llvm-arm-fix-prel31))
 $(eval $(call LLVM_PATCH,llvm-D25865-cmakeshlib))
+# patches for NVPTX
+$(eval $(call LLVM_PATCH,llvm-D9168_argument_alignment)) # Remove for 4.0
+$(eval $(call LLVM_PATCH,llvm-D23597_sdag_names)) # Dep for D24300, remove for 4.0
+$(eval $(call LLVM_PATCH,llvm-D24300_ptx_intrinsics)) # Remove for 4.0
 endif # LLVM_VER
 
 ifeq ($(LLVM_VER),3.7.1)
diff --git a/deps/patches/llvm-D23597_sdag_names.patch b/deps/patches/llvm-D23597_sdag_names.patch
new file mode 100644
index 0000000000000..9eea510f7d62f
--- /dev/null
+++ b/deps/patches/llvm-D23597_sdag_names.patch
@@ -0,0 +1,796 @@
+Index: include/llvm/Target/TargetSelectionDAG.td
+===================================================================
+--- a/include/llvm/Target/TargetSelectionDAG.td
++++ b/include/llvm/Target/TargetSelectionDAG.td
+@@ -450,10 +450,10 @@
+ def fceil      : SDNode<"ISD::FCEIL"      , SDTFPUnaryOp>;
+ def ffloor     : SDNode<"ISD::FFLOOR"     , SDTFPUnaryOp>;
+ def fnearbyint : SDNode<"ISD::FNEARBYINT" , SDTFPUnaryOp>;
+-def frnd       : SDNode<"ISD::FROUND"     , SDTFPUnaryOp>;
++def fround     : SDNode<"ISD::FROUND"     , SDTFPUnaryOp>;
+ 
+-def fround     : SDNode<"ISD::FP_ROUND"   , SDTFPRoundOp>;
+-def fextend    : SDNode<"ISD::FP_EXTEND"  , SDTFPExtendOp>;
++def fpround    : SDNode<"ISD::FP_ROUND"   , SDTFPRoundOp>;
++def fpextend   : SDNode<"ISD::FP_EXTEND"  , SDTFPExtendOp>;
+ def fcopysign  : SDNode<"ISD::FCOPYSIGN"  , SDTFPSignOp>;
+ 
+ def sint_to_fp : SDNode<"ISD::SINT_TO_FP" , SDTIntToFPOp>;
+Index: lib/Target/AArch64/AArch64InstrFormats.td
+===================================================================
+--- a/lib/Target/AArch64/AArch64InstrFormats.td
++++ b/lib/Target/AArch64/AArch64InstrFormats.td
+@@ -3936,27 +3936,27 @@
+ multiclass FPConversion<string asm> {
+   // Double-precision to Half-precision
+   def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm,
+-                             [(set FPR16:$Rd, (fround FPR64:$Rn))]>;
++                             [(set FPR16:$Rd, (fpround FPR64:$Rn))]>;
+ 
+   // Double-precision to Single-precision
+   def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
+-                             [(set FPR32:$Rd, (fround FPR64:$Rn))]>;
++                             [(set FPR32:$Rd, (fpround FPR64:$Rn))]>;
+ 
+   // Half-precision to Double-precision
+   def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm,
+-                             [(set FPR64:$Rd, (fextend FPR16:$Rn))]>;
++                             [(set FPR64:$Rd, (fpextend FPR16:$Rn))]>;
+ 
+   // Half-precision to Single-precision
+   def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm,
+-                             [(set FPR32:$Rd, (fextend FPR16:$Rn))]>;
++                             [(set FPR32:$Rd, (fpextend FPR16:$Rn))]>;
+ 
+   // Single-precision to Double-precision
+   def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
+-                             [(set FPR64:$Rd, (fextend FPR32:$Rn))]>;
++                             [(set FPR64:$Rd, (fpextend FPR32:$Rn))]>;
+ 
+   // Single-precision to Half-precision
+   def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm,
+-                             [(set FPR16:$Rd, (fround FPR32:$Rn))]>;
++                             [(set FPR16:$Rd, (fpround FPR32:$Rn))]>;
+ }
+ 
+ //---
+Index: lib/Target/AArch64/AArch64InstrInfo.td
+===================================================================
+--- a/lib/Target/AArch64/AArch64InstrInfo.td
++++ b/lib/Target/AArch64/AArch64InstrInfo.td
+@@ -2545,8 +2545,8 @@
+ defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">;
+ defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">;
+ defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">;
+-defm : FPToIntegerPats<fp_to_sint, frnd,   "FCVTAS">;
+-defm : FPToIntegerPats<fp_to_uint, frnd,   "FCVTAU">;
++defm : FPToIntegerPats<fp_to_sint, fround, "FCVTAS">;
++defm : FPToIntegerPats<fp_to_uint, fround, "FCVTAU">;
+ 
+ //===----------------------------------------------------------------------===//
+ // Scaled integer to floating point conversion instructions.
+@@ -2582,7 +2582,7 @@
+ defm FABS   : SingleOperandFPData<0b0001, "fabs", fabs>;
+ defm FMOV   : SingleOperandFPData<0b0000, "fmov">;
+ defm FNEG   : SingleOperandFPData<0b0010, "fneg", fneg>;
+-defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>;
++defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>;
+ defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
+ defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
+ defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>;
+@@ -2788,13 +2788,13 @@
+ def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
+                                                               (i64 4)))),
+           (FCVTLv8i16 V128:$Rn)>;
+-def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
+-def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
++def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
++def : Pat<(v2f64 (fpextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
+                                                     (i64 2))))),
+           (FCVTLv4i32 V128:$Rn)>;
+ 
+-def : Pat<(v4f32 (fextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
+-def : Pat<(v4f32 (fextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
++def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
++def : Pat<(v4f32 (fpextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
+                                                     (i64 4))))),
+           (FCVTLv8i16 V128:$Rn)>;
+ 
+@@ -2808,9 +2808,9 @@
+ def : Pat<(concat_vectors V64:$Rd,
+                           (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
+           (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+-def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
+-def : Pat<(v4f16 (fround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
+-def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))),
++def : Pat<(v2f32 (fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
++def : Pat<(v4f16 (fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
++def : Pat<(concat_vectors V64:$Rd, (v2f32 (fpround (v2f64 V128:$Rn)))),
+           (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+ defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
+ defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
+@@ -2833,7 +2833,7 @@
+ 
+ defm FNEG   : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
+ defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>;
+-defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>;
++defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>;
+ defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
+ defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
+ defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>;
+Index: lib/Target/AMDGPU/SIInstructions.td
+===================================================================
+--- a/lib/Target/AMDGPU/SIInstructions.td
++++ b/lib/Target/AMDGPU/SIInstructions.td
+@@ -1107,10 +1107,10 @@
+   VOP_I32_F32, cvt_flr_i32_f32>;
+ defm V_CVT_OFF_F32_I4 : VOP1Inst  <vop1<0x0e>, "v_cvt_off_f32_i4", VOP_F32_I32>;
+ defm V_CVT_F32_F64 : VOP1Inst <vop1<0xf>, "v_cvt_f32_f64",
+-  VOP_F32_F64, fround
++  VOP_F32_F64, fpround
+ >;
+ defm V_CVT_F64_F32 : VOP1Inst <vop1<0x10>, "v_cvt_f64_f32",
+-  VOP_F64_F32, fextend
++  VOP_F64_F32, fpextend
+ >;
+ defm V_CVT_F32_UBYTE0 : VOP1Inst <vop1<0x11>, "v_cvt_f32_ubyte0",
+   VOP_F32_I32, AMDGPUcvt_f32_ubyte0
+Index: lib/Target/ARM/ARMInstrVFP.td
+===================================================================
+--- a/lib/Target/ARM/ARMInstrVFP.td
++++ b/lib/Target/ARM/ARMInstrVFP.td
+@@ -624,7 +624,7 @@
+ def VCVTDS  : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
+                    (outs DPR:$Dd), (ins SPR:$Sm),
+                    IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm",
+-                   [(set DPR:$Dd, (fextend SPR:$Sm))]> {
++                   [(set DPR:$Dd, (fpextend SPR:$Sm))]> {
+   // Instruction operands.
+   bits<5> Dd;
+   bits<5> Sm;
+@@ -641,7 +641,7 @@
+ // Special case encoding: bits 11-8 is 0b1011.
+ def VCVTSD  : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
+                     IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm",
+-                    [(set SPR:$Sd, (fround DPR:$Dm))]> {
++                    [(set SPR:$Sd, (fpround DPR:$Dm))]> {
+   // Instruction operands.
+   bits<5> Sd;
+   bits<5> Dm;
+@@ -838,7 +838,7 @@
+   }
+ }
+ 
+-defm VCVTA : vcvt_inst<"a", 0b00, frnd>;
++defm VCVTA : vcvt_inst<"a", 0b00, fround>;
+ defm VCVTN : vcvt_inst<"n", 0b01>;
+ defm VCVTP : vcvt_inst<"p", 0b10, fceil>;
+ defm VCVTM : vcvt_inst<"m", 0b11, ffloor>;
+@@ -938,7 +938,7 @@
+         Requires<[HasFPARMv8,HasDPVFP]>;
+ }
+ 
+-defm VRINTA : vrint_inst_anpm<"a", 0b00, frnd>;
++defm VRINTA : vrint_inst_anpm<"a", 0b00, fround>;
+ defm VRINTN : vrint_inst_anpm<"n", 0b01>;
+ defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>;
+ defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>;
+Index: lib/Target/Hexagon/HexagonISelLowering.cpp
+===================================================================
+--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
++++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
+@@ -1906,7 +1906,7 @@
+   }
+   // Turn FP truncstore into trunc + store.
+   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+-  // Turn FP extload into load/fextend.
++  // Turn FP extload into load/fpextend.
+   for (MVT VT : MVT::fp_valuetypes())
+     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+ 
+Index: lib/Target/Hexagon/HexagonInstrInfoV5.td
+===================================================================
+--- a/lib/Target/Hexagon/HexagonInstrInfoV5.td
++++ b/lib/Target/Hexagon/HexagonInstrInfoV5.td
+@@ -564,10 +564,10 @@
+ 
+ // Convert single precision to double precision and vice-versa.
+ def F2_conv_sf2df : F2_RDD_RS_CONVERT <"convert_sf2df", 0b000,
+-                                       fextend, F64, F32>;
++                                       fpextend, F64, F32>;
+ 
+ def F2_conv_df2sf : F2_RD_RSS_CONVERT <"convert_df2sf", 0b000,
+-                                       fround, F32, F64>;
++                                       fpround, F32, F64>;
+ 
+ // Convert Integer to Floating Point.
+ def F2_conv_d2sf : F2_RD_RSS_CONVERT <"convert_d2sf", 0b010,
+Index: lib/Target/Mips/MipsInstrFPU.td
+===================================================================
+--- a/lib/Target/Mips/MipsInstrFPU.td
++++ b/lib/Target/Mips/MipsInstrFPU.td
+@@ -635,9 +635,9 @@
+               (PseudoCVT_D32_W GPR32Opnd:$src)>, FGR_32;
+ def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
+               (TRUNC_W_D32 AFGR64Opnd:$src)>, FGR_32;
+-def : MipsPat<(f32 (fround AFGR64Opnd:$src)),
++def : MipsPat<(f32 (fpround AFGR64Opnd:$src)),
+               (CVT_S_D32 AFGR64Opnd:$src)>, FGR_32;
+-def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
++def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+               (CVT_D32_S FGR32Opnd:$src)>, FGR_32;
+ 
+ def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>, FGR_64;
+@@ -657,9 +657,9 @@
+ def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+               (TRUNC_L_D64 FGR64Opnd:$src)>, FGR_64;
+ 
+-def : MipsPat<(f32 (fround FGR64Opnd:$src)),
++def : MipsPat<(f32 (fpround FGR64Opnd:$src)),
+               (CVT_S_D64 FGR64Opnd:$src)>, FGR_64;
+-def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
++def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+               (CVT_D64_S FGR32Opnd:$src)>, FGR_64;
+ 
+ // Patterns for loads/stores with a reg+imm operand.
+Index: lib/Target/NVPTX/NVPTXISelLowering.cpp
+===================================================================
+--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
++++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
+@@ -206,7 +206,7 @@
+   // intrinsics.
+   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ 
+-  // Turn FP extload into load/fextend
++  // Turn FP extload into load/fpextend
+   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+Index: lib/Target/NVPTX/NVPTXInstrInfo.td
+===================================================================
+--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
++++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
+@@ -2613,16 +2613,16 @@
+ def : Pat<(ctpop Int16Regs:$a),
+           (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
+ 
+-// fround f64 -> f32
+-def : Pat<(f32 (fround Float64Regs:$a)),
++// fpround f64 -> f32
++def : Pat<(f32 (fpround Float64Regs:$a)),
+           (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+-def : Pat<(f32 (fround Float64Regs:$a)),
++def : Pat<(f32 (fpround Float64Regs:$a)),
+           (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+ 
+-// fextend f32 -> f64
+-def : Pat<(f64 (fextend Float32Regs:$a)),
++// fpextend f32 -> f64
++def : Pat<(f64 (fpextend Float32Regs:$a)),
+           (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+-def : Pat<(f64 (fextend Float32Regs:$a)),
++def : Pat<(f64 (fpextend Float32Regs:$a)),
+           (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
+ 
+ def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+Index: lib/Target/PowerPC/PPCInstrInfo.td
+===================================================================
+--- a/lib/Target/PowerPC/PPCInstrInfo.td
++++ b/lib/Target/PowerPC/PPCInstrInfo.td
+@@ -2110,15 +2110,15 @@
+ 
+   defm FRSP   : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB),
+                           "frsp", "$frD, $frB", IIC_FPGeneral,
+-                          [(set f32:$frD, (fround f64:$frB))]>;
++                          [(set f32:$frD, (fpround f64:$frB))]>;
+ 
+   let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+   defm FRIND  : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB),
+                           "frin", "$frD, $frB", IIC_FPGeneral,
+-                          [(set f64:$frD, (frnd f64:$frB))]>;
++                          [(set f64:$frD, (fround f64:$frB))]>;
+   defm FRINS  : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB),
+                           "frin", "$frD, $frB", IIC_FPGeneral,
+-                          [(set f32:$frD, (frnd f32:$frB))]>;
++                          [(set f32:$frD, (fround f32:$frB))]>;
+   }
+ 
+   let hasSideEffects = 0 in {
+@@ -2856,7 +2856,7 @@
+ def : Pat<(f64 (extloadf32 xaddr:$src)),
+           (COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>;
+ 
+-def : Pat<(f64 (fextend f32:$src)),
++def : Pat<(f64 (fpextend f32:$src)),
+           (COPY_TO_REGCLASS $src, F8RC)>;
+ 
+ // Only seq_cst fences require the heavyweight sync (SYNC 0).
+Index: lib/Target/PowerPC/PPCInstrQPX.td
+===================================================================
+--- a/lib/Target/PowerPC/PPCInstrQPX.td
++++ b/lib/Target/PowerPC/PPCInstrQPX.td
+@@ -88,11 +88,11 @@
+   return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32;
+ }]>;
+ 
+-def fround_inexact : PatFrag<(ops node:$val), (fround node:$val), [{
++def fround_inexact : PatFrag<(ops node:$val), (fpround node:$val), [{
+   return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 0;
+ }]>;
+ 
+-def fround_exact : PatFrag<(ops node:$val), (fround node:$val), [{
++def fround_exact : PatFrag<(ops node:$val), (fpround node:$val), [{
+   return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 1;
+ }]>;
+ 
+@@ -311,11 +311,11 @@
+ 
+   def QVFRIN : XForm_19<4, 392, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                         "qvfrin $FRT, $FRB", IIC_FPGeneral,
+-                        [(set v4f64:$FRT, (frnd v4f64:$FRB))]>;
++                        [(set v4f64:$FRT, (fround v4f64:$FRB))]>;
+   let isCodeGenOnly = 1 in
+     def QVFRINs : XForm_19<4, 392, (outs qsrc:$FRT), (ins qsrc:$FRB),
+                            "qvfrin $FRT, $FRB", IIC_FPGeneral,
+-                           [(set v4f32:$FRT, (frnd v4f32:$FRB))]>;
++                           [(set v4f32:$FRT, (fround v4f32:$FRB))]>;
+ 
+   def QVFRIP : XForm_19<4, 456, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                         "qvfrip $FRT, $FRB", IIC_FPGeneral,
+@@ -1103,7 +1103,7 @@
+ def : Pat<(not v4i1:$FRA),
+           (QVFLOGICALb $FRA, $FRA, (i32 10))>;
+ 
+-def : Pat<(v4f64 (fextend v4f32:$src)),
++def : Pat<(v4f64 (fpextend v4f32:$src)),
+           (COPY_TO_REGCLASS $src, QFRC)>;
+ 
+ def : Pat<(v4f32 (fround_exact v4f64:$src)),
+Index: lib/Target/PowerPC/PPCInstrVSX.td
+===================================================================
+--- a/lib/Target/PowerPC/PPCInstrVSX.td
++++ b/lib/Target/PowerPC/PPCInstrVSX.td
+@@ -634,7 +634,7 @@
+   def XSRDPI : XX2Form<60, 73,
+                       (outs vsfrc:$XT), (ins vsfrc:$XB),
+                       "xsrdpi $XT, $XB", IIC_VecFP,
+-                      [(set f64:$XT, (frnd f64:$XB))]>;
++                      [(set f64:$XT, (fround f64:$XB))]>;
+   def XSRDPIC : XX2Form<60, 107,
+                       (outs vsfrc:$XT), (ins vsfrc:$XB),
+                       "xsrdpic $XT, $XB", IIC_VecFP,
+@@ -655,7 +655,7 @@
+   def XVRDPI : XX2Form<60, 201,
+                       (outs vsrc:$XT), (ins vsrc:$XB),
+                       "xvrdpi $XT, $XB", IIC_VecFP,
+-                      [(set v2f64:$XT, (frnd v2f64:$XB))]>;
++                      [(set v2f64:$XT, (fround v2f64:$XB))]>;
+   def XVRDPIC : XX2Form<60, 235,
+                       (outs vsrc:$XT), (ins vsrc:$XB),
+                       "xvrdpic $XT, $XB", IIC_VecFP,
+@@ -676,7 +676,7 @@
+   def XVRSPI : XX2Form<60, 137,
+                       (outs vsrc:$XT), (ins vsrc:$XB),
+                       "xvrspi $XT, $XB", IIC_VecFP,
+-                      [(set v4f32:$XT, (frnd v4f32:$XB))]>;
++                      [(set v4f32:$XT, (fround v4f32:$XB))]>;
+   def XVRSPIC : XX2Form<60, 171,
+                       (outs vsrc:$XT), (ins vsrc:$XB),
+                       "xvrspic $XT, $XB", IIC_VecFP,
+@@ -1108,7 +1108,7 @@
+ 
+   def : Pat<(f64 (extloadf32 xoaddr:$src)),
+             (COPY_TO_REGCLASS (LXSSPX xoaddr:$src), VSFRC)>;
+-  def : Pat<(f64 (fextend f32:$src)),
++  def : Pat<(f64 (fpextend f32:$src)),
+             (COPY_TO_REGCLASS $src, VSFRC)>;
+ 
+   def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
+Index: lib/Target/Sparc/SparcISelLowering.cpp
+===================================================================
+--- a/lib/Target/Sparc/SparcISelLowering.cpp
++++ b/lib/Target/Sparc/SparcISelLowering.cpp
+@@ -1508,7 +1508,7 @@
+     //    AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
+   }
+ 
+-  // Turn FP extload into load/fextend
++  // Turn FP extload into load/fpextend
+   for (MVT VT : MVT::fp_valuetypes()) {
+     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
+Index: lib/Target/Sparc/SparcInstrInfo.td
+===================================================================
+--- a/lib/Target/Sparc/SparcInstrInfo.td
++++ b/lib/Target/Sparc/SparcInstrInfo.td
+@@ -1131,32 +1131,32 @@
+ def FSTOD : F3_3u<2, 0b110100, 0b011001001,
+                  (outs DFPRegs:$rd), (ins FPRegs:$rs2),
+                  "fstod $rs2, $rd",
+-                 [(set f64:$rd, (fextend f32:$rs2))],
++                 [(set f64:$rd, (fpextend f32:$rs2))],
+                  IIC_fpu_stod>;
+ def FSTOQ : F3_3u<2, 0b110100, 0b011001101,
+                  (outs QFPRegs:$rd), (ins FPRegs:$rs2),
+                  "fstoq $rs2, $rd",
+-                 [(set f128:$rd, (fextend f32:$rs2))]>,
++                 [(set f128:$rd, (fpextend f32:$rs2))]>,
+                  Requires<[HasHardQuad]>;
+ def FDTOS : F3_3u<2, 0b110100, 0b011000110,
+                  (outs FPRegs:$rd), (ins DFPRegs:$rs2),
+                  "fdtos $rs2, $rd",
+-                 [(set f32:$rd, (fround f64:$rs2))],
++                 [(set f32:$rd, (fpround f64:$rs2))],
+                  IIC_fpu_fast_instr>;
+ def FDTOQ : F3_3u<2, 0b110100, 0b011001110,
+                  (outs QFPRegs:$rd), (ins DFPRegs:$rs2),
+                  "fdtoq $rs2, $rd",
+-                 [(set f128:$rd, (fextend f64:$rs2))]>,
++                 [(set f128:$rd, (fpextend f64:$rs2))]>,
+                  Requires<[HasHardQuad]>;
+ def FQTOS : F3_3u<2, 0b110100, 0b011000111,
+                  (outs FPRegs:$rd), (ins QFPRegs:$rs2),
+                  "fqtos $rs2, $rd",
+-                 [(set f32:$rd, (fround f128:$rs2))]>,
++                 [(set f32:$rd, (fpround f128:$rs2))]>,
+                  Requires<[HasHardQuad]>;
+ def FQTOD : F3_3u<2, 0b110100, 0b011001011,
+                  (outs DFPRegs:$rd), (ins QFPRegs:$rs2),
+                  "fqtod $rs2, $rd",
+-                 [(set f64:$rd, (fround f128:$rs2))]>,
++                 [(set f64:$rd, (fpround f128:$rs2))]>,
+                  Requires<[HasHardQuad]>;
+ 
+ // Floating-point Move Instructions, p. 144
+@@ -1255,14 +1255,14 @@
+ def FSMULD : F3_3<2, 0b110100, 0b001101001,
+                   (outs DFPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+                   "fsmuld $rs1, $rs2, $rd",
+-                  [(set f64:$rd, (fmul (fextend f32:$rs1),
+-                                        (fextend f32:$rs2)))],
++                  [(set f64:$rd, (fmul (fpextend f32:$rs1),
++                                        (fpextend f32:$rs2)))],
+                   IIC_fpu_muld>;
+ def FDMULQ : F3_3<2, 0b110100, 0b001101110,
+                   (outs QFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                   "fdmulq $rs1, $rs2, $rd",
+-                  [(set f128:$rd, (fmul (fextend f64:$rs1),
+-                                         (fextend f64:$rs2)))]>,
++                  [(set f128:$rd, (fmul (fpextend f64:$rs1),
++                                         (fpextend f64:$rs2)))]>,
+                   Requires<[HasHardQuad]>;
+ 
+ // FDIVS generates an erratum on LEON processors, so by disabling this instruction
+Index: lib/Target/SystemZ/SystemZISelLowering.cpp
+===================================================================
+--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
++++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
+@@ -4995,8 +4995,8 @@
+ 
+ SDValue SystemZTargetLowering::combineFP_ROUND(
+     SDNode *N, DAGCombinerInfo &DCI) const {
+-  // (fround (extract_vector_elt X 0))
+-  // (fround (extract_vector_elt X 1)) ->
++  // (fpround (extract_vector_elt X 0))
++  // (fpround (extract_vector_elt X 1)) ->
+   // (extract_vector_elt (VROUND X) 0)
+   // (extract_vector_elt (VROUND X) 1)
+   //
+Index: lib/Target/SystemZ/SystemZInstrFP.td
+===================================================================
+--- a/lib/Target/SystemZ/SystemZInstrFP.td
++++ b/lib/Target/SystemZ/SystemZInstrFP.td
+@@ -154,7 +154,7 @@
+ // Convert floating-point values to narrower representations, rounding
+ // according to the current mode.  The destination of LEXBR and LDXBR
+ // is a 128-bit value, but only the first register of the pair is used.
+-def LEDBR : UnaryRRE<"ledb", 0xB344, fround,    FP32,  FP64>;
++def LEDBR : UnaryRRE<"ledb", 0xB344, fpround,    FP32,  FP64>;
+ def LEXBR : UnaryRRE<"lexb", 0xB346, null_frag, FP128, FP128>;
+ def LDXBR : UnaryRRE<"ldxb", 0xB345, null_frag, FP128, FP128>;
+ 
+@@ -165,15 +165,15 @@
+ def LDXBRA : UnaryRRF4<"ldxbra", 0xB345, FP128, FP128>,
+              Requires<[FeatureFPExtension]>;
+ 
+-def : Pat<(f32 (fround FP128:$src)),
++def : Pat<(f32 (fpround FP128:$src)),
+           (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>;
+-def : Pat<(f64 (fround FP128:$src)),
++def : Pat<(f64 (fpround FP128:$src)),
+           (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>;
+ 
+ // Extend register floating-point values to wider representations.
+-def LDEBR : UnaryRRE<"ldeb", 0xB304, fextend, FP64,  FP32>;
+-def LXEBR : UnaryRRE<"lxeb", 0xB306, fextend, FP128, FP32>;
+-def LXDBR : UnaryRRE<"lxdb", 0xB305, fextend, FP128, FP64>;
++def LDEBR : UnaryRRE<"ldeb", 0xB304, fpextend, FP64,  FP32>;
++def LXEBR : UnaryRRE<"lxeb", 0xB306, fpextend, FP128, FP32>;
++def LXDBR : UnaryRRE<"lxdb", 0xB305, fpextend, FP128, FP64>;
+ 
+ // Extend memory floating-point values to wider representations.
+ def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64,  4>;
+@@ -347,9 +347,9 @@
+ 
+   // Same idea for round, where mode 1 is round towards nearest with
+   // ties away from zero.
+-  def : Pat<(frnd FP32:$src),  (FIEBRA 1, FP32:$src,  4)>;
+-  def : Pat<(frnd FP64:$src),  (FIDBRA 1, FP64:$src,  4)>;
+-  def : Pat<(frnd FP128:$src), (FIXBRA 1, FP128:$src, 4)>;
++  def : Pat<(fround FP32:$src),  (FIEBRA 1, FP32:$src,  4)>;
++  def : Pat<(fround FP64:$src),  (FIDBRA 1, FP64:$src,  4)>;
++  def : Pat<(fround FP128:$src), (FIXBRA 1, FP128:$src, 4)>;
+ }
+ 
+ //===----------------------------------------------------------------------===//
+@@ -388,26 +388,26 @@
+ 
+ // f64 multiplication of two FP32 registers.
+ def MDEBR : BinaryRRE<"mdeb", 0xB30C, null_frag, FP64, FP32>;
+-def : Pat<(fmul (f64 (fextend FP32:$src1)), (f64 (fextend FP32:$src2))),
++def : Pat<(fmul (f64 (fpextend FP32:$src1)), (f64 (fpextend FP32:$src2))),
+           (MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                 FP32:$src1, subreg_r32), FP32:$src2)>;
+ 
+ // f64 multiplication of an FP32 register and an f32 memory.
+ def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
+-def : Pat<(fmul (f64 (fextend FP32:$src1)),
++def : Pat<(fmul (f64 (fpextend FP32:$src1)),
+                 (f64 (extloadf32 bdxaddr12only:$addr))),
+           (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_r32),
+                 bdxaddr12only:$addr)>;
+ 
+ // f128 multiplication of two FP64 registers.
+ def MXDBR : BinaryRRE<"mxdb", 0xB307, null_frag, FP128, FP64>;
+-def : Pat<(fmul (f128 (fextend FP64:$src1)), (f128 (fextend FP64:$src2))),
++def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))),
+           (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
+                                 FP64:$src1, subreg_h64), FP64:$src2)>;
+ 
+ // f128 multiplication of an FP64 register and an f64 memory.
+ def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
+-def : Pat<(fmul (f128 (fextend FP64:$src1)),
++def : Pat<(fmul (f128 (fpextend FP64:$src1)),
+                 (f128 (extloadf64 bdxaddr12only:$addr))),
+           (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64),
+                 bdxaddr12only:$addr)>;
+Index: lib/Target/SystemZ/SystemZInstrVector.td
+===================================================================
+--- a/lib/Target/SystemZ/SystemZInstrVector.td
++++ b/lib/Target/SystemZ/SystemZInstrVector.td
+@@ -798,7 +798,7 @@
+   def : FPConversion<insn, ffloor,     tr, tr, 4, 7>;
+   def : FPConversion<insn, fceil,      tr, tr, 4, 6>;
+   def : FPConversion<insn, ftrunc,     tr, tr, 4, 5>;
+-  def : FPConversion<insn, frnd,       tr, tr, 4, 1>;
++  def : FPConversion<insn, fround,     tr, tr, 4, 1>;
+ }
+ 
+ let Predicates = [FeatureVector] in {
+@@ -840,13 +840,13 @@
+ 
+   // Load lengthened.
+   def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>;
+-  def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fextend, v64db, v32eb, 2, 8>;
++  def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32eb, 2, 8>;
+ 
+   // Load rounded,
+   def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>;
+   def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>;
+   def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>;
+-  def : FPConversion<WLEDB, fround, v32eb, v64db, 0, 0>;
++  def : FPConversion<WLEDB, fpround, v32eb, v64db, 0, 0>;
+ 
+   // Multiply.
+   def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>;
+Index: lib/Target/WebAssembly/WebAssemblyInstrConv.td
+===================================================================
+--- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td
++++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
+@@ -89,10 +89,10 @@
+                           "f64.convert_u/i64\t$dst, $src">;
+ 
+ def F64_PROMOTE_F32 : I<(outs F64:$dst), (ins F32:$src),
+-                        [(set F64:$dst, (fextend F32:$src))],
++                        [(set F64:$dst, (fpextend F32:$src))],
+                         "f64.promote/f32\t$dst, $src">;
+ def F32_DEMOTE_F64 : I<(outs F32:$dst), (ins F64:$src),
+-                       [(set F32:$dst, (fround F64:$src))],
++                       [(set F32:$dst, (fpround F64:$src))],
+                        "f32.demote/f64\t$dst, $src">;
+ 
+ def I32_REINTERPRET_F32 : I<(outs I32:$dst), (ins F32:$src),
+Index: lib/Target/X86/X86InstrAVX512.td
+===================================================================
+--- a/lib/Target/X86/X86InstrAVX512.td
++++ b/lib/Target/X86/X86InstrAVX512.td
+@@ -5595,11 +5595,11 @@
+ defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext,
+                                           X86fpextRnd,f32x_info, f64x_info >;
+ 
+-def : Pat<(f64 (fextend FR32X:$src)),
++def : Pat<(f64 (fpextend FR32X:$src)),
+           (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
+                                (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>,
+           Requires<[HasAVX512]>;
+-def : Pat<(f64 (fextend (loadf32 addr:$src))),
++def : Pat<(f64 (fpextend (loadf32 addr:$src))),
+           (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+           Requires<[HasAVX512]>;
+ 
+@@ -5612,7 +5612,7 @@
+                     (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>,
+           Requires<[HasAVX512, OptForSpeed]>;
+ 
+-def : Pat<(f32 (fround FR64X:$src)),
++def : Pat<(f32 (fpround FR64X:$src)),
+           (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
+                     (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
+            Requires<[HasAVX512]>;
+@@ -5666,29 +5666,29 @@
+ // Extend Float to Double
+ multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> {
+   let Predicates = [HasAVX512] in {
+-    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fextend>,
++    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fpextend>,
+              avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
+                                 X86vfpextRnd>, EVEX_V512;
+   }
+   let Predicates = [HasVLX] in {
+     defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info,
+                                X86vfpext, "{1to2}">, EVEX_V128;
+-    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fextend>,
++    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend>,
+                                      EVEX_V256;
+   }
+ }
+ 
+ // Truncate Double to Float
+ multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> {
+   let Predicates = [HasAVX512] in {
+-    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fround>,
++    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround>,
+              avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
+                                X86vfproundRnd>, EVEX_V512;
+   }
+   let Predicates = [HasVLX] in {
+     defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
+                                X86vfpround, "{1to2}", "{x}">, EVEX_V128;
+-    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fround,
++    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround,
+                                "{1to4}", "{y}">, EVEX_V256;
+   }
+ }
+@@ -6025,7 +6025,7 @@
+ }
+ 
+ let Predicates = [HasAVX512] in {
+-  def : Pat<(v8f32 (fround (loadv8f64 addr:$src))),
++  def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))),
+             (VCVTPD2PSZrm addr:$src)>;
+   def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+             (VCVTPS2PDZrm addr:$src)>;
+Index: lib/Target/X86/X86InstrFPStack.td
+===================================================================
+--- a/lib/Target/X86/X86InstrFPStack.td
++++ b/lib/Target/X86/X86InstrFPStack.td
+@@ -711,19 +711,19 @@
+ 
+ // FP extensions map onto simple pseudo-value conversions if they are to/from
+ // the FP stack.
+-def : Pat<(f64 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
++def : Pat<(f64 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
+           Requires<[FPStackf32]>;
+-def : Pat<(f80 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
++def : Pat<(f80 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
+            Requires<[FPStackf32]>;
+-def : Pat<(f80 (fextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
++def : Pat<(f80 (fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
+            Requires<[FPStackf64]>;
+ 
+ // FP truncations map onto simple pseudo-value conversions if they are to/from
+ // the FP stack.  We have validated that only value-preserving truncations make
+ // it through isel.
+-def : Pat<(f32 (fround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
++def : Pat<(f32 (fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
+           Requires<[FPStackf32]>;
+-def : Pat<(f32 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
++def : Pat<(f32 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
+            Requires<[FPStackf32]>;
+-def : Pat<(f64 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
++def : Pat<(f64 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
+            Requires<[FPStackf64]>;
+Index: lib/Target/X86/X86InstrSSE.td
+===================================================================
+--- a/lib/Target/X86/X86InstrSSE.td
++++ b/lib/Target/X86/X86InstrSSE.td
+@@ -1799,16 +1799,16 @@
+                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ }
+ 
+-def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
++def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
+           Requires<[UseAVX]>;
+ 
+ def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
+                       "cvtsd2ss\t{$src, $dst|$dst, $src}",
+-                      [(set FR32:$dst, (fround FR64:$src))],
++                      [(set FR32:$dst, (fpround FR64:$src))],
+                       IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
+ def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+                       "cvtsd2ss\t{$src, $dst|$dst, $src}",
+-                      [(set FR32:$dst, (fround (loadf64 addr:$src)))],
++                      [(set FR32:$dst, (fpround (loadf64 addr:$src)))],
+                       IIC_SSE_CVT_Scalar_RM>,
+                       XD,
+                   Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+@@ -1865,9 +1865,9 @@
+                     Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ }
+ 
+-def : Pat<(f64 (fextend FR32:$src)),
++def : Pat<(f64 (fpextend FR32:$src)),
+     (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
+-def : Pat<(fextend (loadf32 addr:$src)),
++def : Pat<(fpextend (loadf32 addr:$src)),
+     (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
+ 
+ def : Pat<(extloadf32 addr:$src),
+@@ -1879,21 +1879,21 @@
+ 
+ def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
+                    "cvtss2sd\t{$src, $dst|$dst, $src}",
+-                   [(set FR64:$dst, (fextend FR32:$src))],
++                   [(set FR64:$dst, (fpextend FR32:$src))],
+                    IIC_SSE_CVT_Scalar_RR>, XS,
+                  Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
+ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
+                    "cvtss2sd\t{$src, $dst|$dst, $src}",
+                    [(set FR64:$dst, (extloadf32 addr:$src))],
+                    IIC_SSE_CVT_Scalar_RM>, XS,
+                  Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+ 
+-// extload f32 -> f64.  This matches load+fextend because we have a hack in
++// extload f32 -> f64.  This matches load+fpextend because we have a hack in
+ // the isel (PreprocessForFPConvert) that can introduce loads after dag
+ // combine.
+-// Since these loads aren't folded into the fextend, we have to match it
++// Since these loads aren't folded into the fpextend, we have to match it
+ // explicitly here.
+-def : Pat<(fextend (loadf32 addr:$src)),
++def : Pat<(fpextend (loadf32 addr:$src)),
+           (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
+ def : Pat<(extloadf32 addr:$src),
+           (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
+@@ -2269,26 +2269,26 @@
+ }
+ 
+ let Predicates = [HasAVX, NoVLX] in {
+-  // Match fround and fextend for 128/256-bit conversions
++  // Match fpround and fpextend for 128/256-bit conversions
+   def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+             (VCVTPD2PSrr VR128:$src)>;
+   def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
+             (VCVTPD2PSXrm addr:$src)>;
+-  def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
++  def : Pat<(v4f32 (fpround (v4f64 VR256:$src))),
+             (VCVTPD2PSYrr VR256:$src)>;
+-  def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
++  def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))),
+             (VCVTPD2PSYrm addr:$src)>;
+ 
+   def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
+             (VCVTPS2PDrr VR128:$src)>;
+-  def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
++  def : Pat<(v4f64 (fpextend (v4f32 VR128:$src))),
+             (VCVTPS2PDYrr VR128:$src)>;
+   def : Pat<(v4f64 (extloadv4f32 addr:$src)),
+             (VCVTPS2PDYrm addr:$src)>;
+ }
+ 
+ let Predicates = [UseSSE2] in {
+-  // Match fround and fextend for 128 conversions
++  // Match fpround and fpextend for 128 conversions
+   def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+             (CVTPD2PSrr VR128:$src)>;
+   def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
diff --git a/deps/patches/llvm-D24300_ptx_intrinsics.patch b/deps/patches/llvm-D24300_ptx_intrinsics.patch
new file mode 100644
index 0000000000000..e0c1e5a286c56
--- /dev/null
+++ b/deps/patches/llvm-D24300_ptx_intrinsics.patch
@@ -0,0 +1,506 @@
+Index: lib/Target/NVPTX/NVPTXISelLowering.cpp
+===================================================================
+--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
++++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
+@@ -279,6 +279,28 @@
+   setTargetDAGCombine(ISD::SHL);
+   setTargetDAGCombine(ISD::SELECT);
+ 
++  // Library functions.  These default to Expand, but we have instructions
++  // for them.
++  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
++  setOperationAction(ISD::FCEIL,  MVT::f64, Legal);
++  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
++  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
++  setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
++  setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
++  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
++  setOperationAction(ISD::FRINT,  MVT::f64, Legal);
++  setOperationAction(ISD::FROUND, MVT::f32, Legal);
++  setOperationAction(ISD::FROUND, MVT::f64, Legal);
++  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
++  setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
++  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
++  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
++  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
++  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
++
++  // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
++  // No FPOW or FREM in PTX.
++
+   // Now deduce the information based on the above mentioned
+   // actions
+   computeRegisterProperties(STI.getRegisterInfo());
+Index: lib/Target/NVPTX/NVPTXInstrInfo.td
+===================================================================
+--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
++++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
+@@ -207,15 +207,63 @@
+ }
+ 
+ // Template for instructions which take three fp64 or fp32 args.  The
+-// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
++// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
+ //
+ // Also defines ftz (flush subnormal inputs and results to sign-preserving
+ // zero) variants for fp32 functions.
++//
++// This multiclass should be used for nodes that cannot be folded into FMAs.
++// For nodes that can be folded into FMAs (i.e. adds and muls), use
++// F3_fma_component.
+ multiclass F3<string OpcStr, SDNode OpNode> {
+    def f64rr :
+      NVPTXInst<(outs Float64Regs:$dst),
+                (ins Float64Regs:$a, Float64Regs:$b),
+                !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
++               [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
++   def f64ri :
++     NVPTXInst<(outs Float64Regs:$dst),
++               (ins Float64Regs:$a, f64imm:$b),
++               !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
++               [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
++   def f32rr_ftz :
++     NVPTXInst<(outs Float32Regs:$dst),
++               (ins Float32Regs:$a, Float32Regs:$b),
++               !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
++               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
++               Requires<[doF32FTZ]>;
++   def f32ri_ftz :
++     NVPTXInst<(outs Float32Regs:$dst),
++               (ins Float32Regs:$a, f32imm:$b),
++               !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
++               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
++               Requires<[doF32FTZ]>;
++   def f32rr :
++     NVPTXInst<(outs Float32Regs:$dst),
++               (ins Float32Regs:$a, Float32Regs:$b),
++               !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
++               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
++   def f32ri :
++     NVPTXInst<(outs Float32Regs:$dst),
++               (ins Float32Regs:$a, f32imm:$b),
++               !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
++               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
++}
++
++// Template for instructions which take three fp64 or fp32 args.  The
++// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
++//
++// Also defines ftz (flush subnormal inputs and results to sign-preserving
++// zero) variants for fp32 functions.
++//
++// This multiclass should be used for nodes that can be folded to make fma ops.
++// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
++// just like the non ".rn" op, but prevents ptxas from creating FMAs.
++multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
++   def f64rr :
++     NVPTXInst<(outs Float64Regs:$dst),
++               (ins Float64Regs:$a, Float64Regs:$b),
++               !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+                Requires<[allowFMA]>;
+    def f64ri :
+@@ -248,41 +296,39 @@
+                !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+                Requires<[allowFMA]>;
+-}
+ 
+-// Same as F3, but defines ".rn" variants (round to nearest even).
+-multiclass F3_rn<string OpcStr, SDNode OpNode> {
+-   def f64rr :
++   // These have strange names so we don't perturb existing mir tests.
++   def _rnf64rr :
+      NVPTXInst<(outs Float64Regs:$dst),
+                (ins Float64Regs:$a, Float64Regs:$b),
+                !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+                Requires<[noFMA]>;
+-   def f64ri :
++   def _rnf64ri :
+      NVPTXInst<(outs Float64Regs:$dst),
+                (ins Float64Regs:$a, f64imm:$b),
+                !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+                Requires<[noFMA]>;
+-   def f32rr_ftz :
++   def _rnf32rr_ftz :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, Float32Regs:$b),
+                !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                Requires<[noFMA, doF32FTZ]>;
+-   def f32ri_ftz :
++   def _rnf32ri_ftz :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, f32imm:$b),
+                !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+                Requires<[noFMA, doF32FTZ]>;
+-   def f32rr :
++   def _rnf32rr :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, Float32Regs:$b),
+                !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                Requires<[noFMA]>;
+-   def f32ri :
++   def _rnf32ri :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, f32imm:$b),
+                !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+@@ -713,13 +759,12 @@
+          N->getValueAPF().convertToDouble() == 1.0;
+ }]>;
+ 
+-defm FADD : F3<"add", fadd>;
+-defm FSUB : F3<"sub", fsub>;
+-defm FMUL : F3<"mul", fmul>;
+-
+-defm FADD_rn : F3_rn<"add", fadd>;
+-defm FSUB_rn : F3_rn<"sub", fsub>;
+-defm FMUL_rn : F3_rn<"mul", fmul>;
++defm FADD : F3_fma_component<"add", fadd>;
++defm FSUB : F3_fma_component<"sub", fsub>;
++defm FMUL : F3_fma_component<"mul", fmul>;
++
++defm FMIN : F3<"min", fminnum>;
++defm FMAX : F3<"max", fmaxnum>;
+ 
+ defm FABS  : F2<"abs", fabs>;
+ defm FNEG  : F2<"neg", fneg>;
+@@ -2628,6 +2673,55 @@
+ def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+                      [SDNPHasChain, SDNPOptInGlue]>;
+ 
++// fceil, ffloor, fround, ftrunc.
++
++def : Pat<(fceil Float32Regs:$a),
++          (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
++def : Pat<(fceil Float32Regs:$a),
++          (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
++def : Pat<(fceil Float64Regs:$a),
++          (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
++
++def : Pat<(ffloor Float32Regs:$a),
++          (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
++def : Pat<(ffloor Float32Regs:$a),
++          (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
++def : Pat<(ffloor Float64Regs:$a),
++          (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
++
++def : Pat<(fround Float32Regs:$a),
++          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
++def : Pat<(f32 (fround Float32Regs:$a)),
++          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
++def : Pat<(f64 (fround Float64Regs:$a)),
++          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
++
++def : Pat<(ftrunc Float32Regs:$a),
++          (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
++def : Pat<(ftrunc Float32Regs:$a),
++          (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
++def : Pat<(ftrunc Float64Regs:$a),
++          (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
++
++// nearbyint and rint are implemented as rounding to nearest even.  This isn't
++// strictly correct, because it causes us to ignore the rounding mode.  But it
++// matches what CUDA's "libm" does.
++
++def : Pat<(fnearbyint Float32Regs:$a),
++          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
++def : Pat<(fnearbyint Float32Regs:$a),
++          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
++def : Pat<(fnearbyint Float64Regs:$a),
++          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
++
++def : Pat<(frint Float32Regs:$a),
++          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
++def : Pat<(frint Float32Regs:$a),
++          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
++def : Pat<(frint Float64Regs:$a),
++          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
++
++
+ //-----------------------------------
+ // Control-flow
+ //-----------------------------------
+Index: test/CodeGen/NVPTX/bug22322.ll
+===================================================================
+--- a/test/CodeGen/NVPTX/bug22322.ll
++++ b/test/CodeGen/NVPTX/bug22322.ll
+@@ -22,7 +22,7 @@
+   %8 = icmp eq i32 %7, 0
+   %9 = select i1 %8, float 0.000000e+00, float -1.000000e+00
+   store float %9, float* %ret_vec.sroa.8.i, align 4
+-; CHECK: setp.lt.f32     %p{{[0-9]+}}, %f{{[0-9]+}}, 0f00000000
++; CHECK: max.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, 0f00000000
+   %10 = fcmp olt float %9, 0.000000e+00
+   %ret_vec.sroa.8.i.val = load float, float* %ret_vec.sroa.8.i, align 4
+   %11 = select i1 %10, float 0.000000e+00, float %ret_vec.sroa.8.i.val
+Index: test/CodeGen/NVPTX/math-intrins.ll
+===================================================================
+--- a/test/CodeGen/NVPTX/math-intrins.ll
++++ b/test/CodeGen/NVPTX/math-intrins.ll
+@@ -0,0 +1,261 @@
++; RUN: llc < %s | FileCheck %s
++target triple = "nvptx64-nvidia-cuda"
++
++; Checks that llvm intrinsics for math functions are correctly lowered to PTX.
++
++declare float @llvm.ceil.f32(float) #0
++declare double @llvm.ceil.f64(double) #0
++declare float @llvm.floor.f32(float) #0
++declare double @llvm.floor.f64(double) #0
++declare float @llvm.round.f32(float) #0
++declare double @llvm.round.f64(double) #0
++declare float @llvm.nearbyint.f32(float) #0
++declare double @llvm.nearbyint.f64(double) #0
++declare float @llvm.rint.f32(float) #0
++declare double @llvm.rint.f64(double) #0
++declare float @llvm.trunc.f32(float) #0
++declare double @llvm.trunc.f64(double) #0
++declare float @llvm.fabs.f32(float) #0
++declare double @llvm.fabs.f64(double) #0
++declare float @llvm.minnum.f32(float, float) #0
++declare double @llvm.minnum.f64(double, double) #0
++declare float @llvm.maxnum.f32(float, float) #0
++declare double @llvm.maxnum.f64(double, double) #0
++
++; ---- ceil ----
++
++; CHECK-LABEL: ceil_float
++define float @ceil_float(float %a) {
++  ; CHECK: cvt.rpi.f32.f32
++  %b = call float @llvm.ceil.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: ceil_float_ftz
++define float @ceil_float_ftz(float %a) #1 {
++  ; CHECK: cvt.rpi.ftz.f32.f32
++  %b = call float @llvm.ceil.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: ceil_double
++define double @ceil_double(double %a) {
++  ; CHECK: cvt.rpi.f64.f64
++  %b = call double @llvm.ceil.f64(double %a)
++  ret double %b
++}
++
++; ---- floor ----
++
++; CHECK-LABEL: floor_float
++define float @floor_float(float %a) {
++  ; CHECK: cvt.rmi.f32.f32
++  %b = call float @llvm.floor.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: floor_float_ftz
++define float @floor_float_ftz(float %a) #1 {
++  ; CHECK: cvt.rmi.ftz.f32.f32
++  %b = call float @llvm.floor.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: floor_double
++define double @floor_double(double %a) {
++  ; CHECK: cvt.rmi.f64.f64
++  %b = call double @llvm.floor.f64(double %a)
++  ret double %b
++}
++
++; ---- round ----
++
++; CHECK-LABEL: round_float
++define float @round_float(float %a) {
++  ; CHECK: cvt.rni.f32.f32
++  %b = call float @llvm.round.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: round_float_ftz
++define float @round_float_ftz(float %a) #1 {
++  ; CHECK: cvt.rni.ftz.f32.f32
++  %b = call float @llvm.round.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: round_double
++define double @round_double(double %a) {
++  ; CHECK: cvt.rni.f64.f64
++  %b = call double @llvm.round.f64(double %a)
++  ret double %b
++}
++
++; ---- nearbyint ----
++
++; CHECK-LABEL: nearbyint_float
++define float @nearbyint_float(float %a) {
++  ; CHECK: cvt.rni.f32.f32
++  %b = call float @llvm.nearbyint.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: nearbyint_float_ftz
++define float @nearbyint_float_ftz(float %a) #1 {
++  ; CHECK: cvt.rni.ftz.f32.f32
++  %b = call float @llvm.nearbyint.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: nearbyint_double
++define double @nearbyint_double(double %a) {
++  ; CHECK: cvt.rni.f64.f64
++  %b = call double @llvm.nearbyint.f64(double %a)
++  ret double %b
++}
++
++; ---- rint ----
++
++; CHECK-LABEL: rint_float
++define float @rint_float(float %a) {
++  ; CHECK: cvt.rni.f32.f32
++  %b = call float @llvm.rint.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: rint_float_ftz
++define float @rint_float_ftz(float %a) #1 {
++  ; CHECK: cvt.rni.ftz.f32.f32
++  %b = call float @llvm.rint.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: rint_double
++define double @rint_double(double %a) {
++  ; CHECK: cvt.rni.f64.f64
++  %b = call double @llvm.rint.f64(double %a)
++  ret double %b
++}
++
++; ---- trunc ----
++
++; CHECK-LABEL: trunc_float
++define float @trunc_float(float %a) {
++  ; CHECK: cvt.rzi.f32.f32
++  %b = call float @llvm.trunc.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: trunc_float_ftz
++define float @trunc_float_ftz(float %a) #1 {
++  ; CHECK: cvt.rzi.ftz.f32.f32
++  %b = call float @llvm.trunc.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: trunc_double
++define double @trunc_double(double %a) {
++  ; CHECK: cvt.rzi.f64.f64
++  %b = call double @llvm.trunc.f64(double %a)
++  ret double %b
++}
++
++; ---- abs ----
++
++; CHECK-LABEL: abs_float
++define float @abs_float(float %a) {
++  ; CHECK: abs.f32
++  %b = call float @llvm.fabs.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: abs_float_ftz
++define float @abs_float_ftz(float %a) #1 {
++  ; CHECK: abs.ftz.f32
++  %b = call float @llvm.fabs.f32(float %a)
++  ret float %b
++}
++
++; CHECK-LABEL: abs_double
++define double @abs_double(double %a) {
++  ; CHECK: abs.f64
++  %b = call double @llvm.fabs.f64(double %a)
++  ret double %b
++}
++
++; ---- min ----
++
++; CHECK-LABEL: min_float
++define float @min_float(float %a, float %b) {
++  ; CHECK: min.f32
++  %x = call float @llvm.minnum.f32(float %a, float %b)
++  ret float %x
++}
++
++; CHECK-LABEL: min_imm1
++define float @min_imm1(float %a) {
++  ; CHECK: min.f32
++  %x = call float @llvm.minnum.f32(float %a, float 0.0)
++  ret float %x
++}
++
++; CHECK-LABEL: min_imm2
++define float @min_imm2(float %a) {
++  ; CHECK: min.f32
++  %x = call float @llvm.minnum.f32(float 0.0, float %a)
++  ret float %x
++}
++
++; CHECK-LABEL: min_float_ftz
++define float @min_float_ftz(float %a, float %b) #1 {
++  ; CHECK: min.ftz.f32
++  %x = call float @llvm.minnum.f32(float %a, float %b)
++  ret float %x
++}
++
++; CHECK-LABEL: min_double
++define double @min_double(double %a, double %b) {
++  ; CHECK: min.f64
++  %x = call double @llvm.minnum.f64(double %a, double %b)
++  ret double %x
++}
++
++; ---- max ----
++
++; CHECK-LABEL: max_imm1
++define float @max_imm1(float %a) {
++  ; CHECK: max.f32
++  %x = call float @llvm.maxnum.f32(float %a, float 0.0)
++  ret float %x
++}
++
++; CHECK-LABEL: max_imm2
++define float @max_imm2(float %a) {
++  ; CHECK: max.f32
++  %x = call float @llvm.maxnum.f32(float 0.0, float %a)
++  ret float %x
++}
++
++; CHECK-LABEL: max_float
++define float @max_float(float %a, float %b) {
++  ; CHECK: max.f32
++  %x = call float @llvm.maxnum.f32(float %a, float %b)
++  ret float %x
++}
++
++; CHECK-LABEL: max_float_ftz
++define float @max_float_ftz(float %a, float %b) #1 {
++  ; CHECK: max.ftz.f32
++  %x = call float @llvm.maxnum.f32(float %a, float %b)
++  ret float %x
++}
++
++; CHECK-LABEL: max_double
++define double @max_double(double %a, double %b) {
++  ; CHECK: max.f64
++  %x = call double @llvm.maxnum.f64(double %a, double %b)
++  ret double %x
++}
++
++attributes #0 = { nounwind readnone }
++attributes #1 = { "nvptx-f32ftz" = "true" }
diff --git a/deps/patches/llvm-D9168_argument_alignment.patch b/deps/patches/llvm-D9168_argument_alignment.patch
new file mode 100644
index 0000000000000..8166cc379f419
--- /dev/null
+++ b/deps/patches/llvm-D9168_argument_alignment.patch
@@ -0,0 +1,98 @@
+Index: lib/Target/NVPTX/NVPTXISelLowering.h
+===================================================================
+--- a/lib/Target/NVPTX/NVPTXISelLowering.h
++++ b/lib/Target/NVPTX/NVPTXISelLowering.h
+@@ -539,7 +539,8 @@
+   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ 
+   unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS,
+-                                Type *Ty, unsigned Idx) const;
++                                Type *Ty, unsigned Idx,
++                                const DataLayout &DL) const;
+ };
+ } // namespace llvm
+ 
+Index: lib/Target/NVPTX/NVPTXISelLowering.cpp
+===================================================================
+--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
++++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
+@@ -1024,11 +1024,15 @@
+   return O.str();
+ }
+ 
+-unsigned
+-NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
+-                                          const ImmutableCallSite *CS,
+-                                          Type *Ty,
+-                                          unsigned Idx) const {
++unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
++                                                   const ImmutableCallSite *CS,
++                                                   Type *Ty, unsigned Idx,
++                                                   const DataLayout &DL) const {
++  if (!CS) {
++    // CallSite is zero, fallback to ABI type alignment
++    return DL.getABITypeAlignment(Ty);
++  }
++
+   unsigned Align = 0;
+   const Value *DirectCallee = CS->getCalledFunction();
+ 
+@@ -1046,7 +1050,7 @@
+ 
+       const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
+       // Ignore any bitcast instructions
+-      while(isa<ConstantExpr>(CalleeV)) {
++      while (isa<ConstantExpr>(CalleeV)) {
+         const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
+         if (!CE->isCast())
+           break;
+@@ -1069,7 +1073,6 @@
+ 
+   // Call is indirect or alignment information is not available, fall back to
+   // the ABI type alignment
+-  auto &DL = CS->getCaller()->getParent()->getDataLayout();
+   return DL.getABITypeAlignment(Ty);
+ }
+ 
+@@ -1126,7 +1129,8 @@
+         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets,
+                            0);
+ 
+-        unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
++        unsigned align =
++            getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+         // declare .param .align <align> .b8 .param<n>[<size>];
+         unsigned sz = DL.getTypeAllocSize(Ty);
+         SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+@@ -1166,7 +1170,8 @@
+       }
+       if (Ty->isVectorTy()) {
+         EVT ObjectVT = getValueType(DL, Ty);
+-        unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
++        unsigned align =
++            getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+         // declare .param .align <align> .b8 .param<n>[<size>];
+         unsigned sz = DL.getTypeAllocSize(Ty);
+         SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+@@ -1426,7 +1431,7 @@
+                           DeclareRetOps);
+       InFlag = Chain.getValue(1);
+     } else {
+-      retAlignment = getArgumentAlignment(Callee, CS, retTy, 0);
++      retAlignment = getArgumentAlignment(Callee, CS, retTy, 0, DL);
+       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+       SDValue DeclareRetOps[] = { Chain,
+                                   DAG.getConstant(retAlignment, dl, MVT::i32),
+@@ -1633,9 +1638,10 @@
+     } else {
+       SmallVector<EVT, 16> VTs;
+       SmallVector<uint64_t, 16> Offsets;
+-      ComputePTXValueVTs(*this, DAG.getDataLayout(), retTy, VTs, &Offsets, 0);
++      auto &DL = DAG.getDataLayout();
++      ComputePTXValueVTs(*this, DL, retTy, VTs, &Offsets, 0);
+       assert(VTs.size() == Ins.size() && "Bad value decomposition");
+-      unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0);
++      unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0, DL);
+       for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+         unsigned sz = VTs[i].getSizeInBits();
+         unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);