diff --git a/deps/llvm.mk b/deps/llvm.mk index e3c9b35626e18..37e3b1f62b52a 100644 --- a/deps/llvm.mk +++ b/deps/llvm.mk @@ -78,7 +78,11 @@ LLVM_CFLAGS += $(CFLAGS) LLVM_CXXFLAGS += $(CXXFLAGS) LLVM_CPPFLAGS += $(CPPFLAGS) LLVM_LDFLAGS += $(LDFLAGS) -LLVM_TARGETS := host +ifeq ($(LLVM_USE_CMAKE),1) +LLVM_TARGETS := host;NVPTX +else +LLVM_TARGETS := host,nvptx +endif LLVM_TARGET_FLAGS := --enable-targets=$(LLVM_TARGETS) LLVM_CMAKE += -DLLVM_TARGETS_TO_BUILD:STRING="$(LLVM_TARGETS)" -DCMAKE_BUILD_TYPE="$(LLVM_CMAKE_BUILDTYPE)" LLVM_CMAKE += -DLLVM_TOOLS_INSTALL_DIR=$(shell $(JULIAHOME)/contrib/relative_path.sh $(build_prefix) $(build_depsbindir)) @@ -479,6 +483,10 @@ $(eval $(call LLVM_PATCH,llvm-PR22923)) # Remove for 4.0 $(eval $(call LLVM_PATCH,llvm-r282182)) # Remove for 4.0 $(eval $(call LLVM_PATCH,llvm-arm-fix-prel31)) $(eval $(call LLVM_PATCH,llvm-D25865-cmakeshlib)) +# patches for NVPTX +$(eval $(call LLVM_PATCH,llvm-D9168_argument_alignment)) # Remove for 4.0 +$(eval $(call LLVM_PATCH,llvm-D23597_sdag_names)) # Dep for D24300, remove for 4.0 +$(eval $(call LLVM_PATCH,llvm-D24300_ptx_intrinsics)) # Remove for 4.0 endif # LLVM_VER ifeq ($(LLVM_VER),3.7.1) diff --git a/deps/patches/llvm-D23597_sdag_names.patch b/deps/patches/llvm-D23597_sdag_names.patch new file mode 100644 index 0000000000000..9eea510f7d62f --- /dev/null +++ b/deps/patches/llvm-D23597_sdag_names.patch @@ -0,0 +1,796 @@ +Index: include/llvm/Target/TargetSelectionDAG.td +=================================================================== +--- a/include/llvm/Target/TargetSelectionDAG.td ++++ b/include/llvm/Target/TargetSelectionDAG.td +@@ -450,10 +450,10 @@ + def fceil : SDNode<"ISD::FCEIL" , SDTFPUnaryOp>; + def ffloor : SDNode<"ISD::FFLOOR" , SDTFPUnaryOp>; + def fnearbyint : SDNode<"ISD::FNEARBYINT" , SDTFPUnaryOp>; +-def frnd : SDNode<"ISD::FROUND" , SDTFPUnaryOp>; ++def fround : SDNode<"ISD::FROUND" , SDTFPUnaryOp>; + +-def fround : SDNode<"ISD::FP_ROUND" , SDTFPRoundOp>; +-def fextend : SDNode<"ISD::FP_EXTEND" , SDTFPExtendOp>; ++def fpround : SDNode<"ISD::FP_ROUND" , SDTFPRoundOp>; ++def fpextend : SDNode<"ISD::FP_EXTEND" , SDTFPExtendOp>; + def fcopysign : SDNode<"ISD::FCOPYSIGN" , SDTFPSignOp>; + + def sint_to_fp : SDNode<"ISD::SINT_TO_FP" , SDTIntToFPOp>; +Index: lib/Target/AArch64/AArch64InstrFormats.td +=================================================================== +--- a/lib/Target/AArch64/AArch64InstrFormats.td ++++ b/lib/Target/AArch64/AArch64InstrFormats.td +@@ -3936,27 +3936,27 @@ + multiclass FPConversion { + // Double-precision to Half-precision + def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, +- [(set FPR16:$Rd, (fround FPR64:$Rn))]>; ++ [(set FPR16:$Rd, (fpround FPR64:$Rn))]>; + + // Double-precision to Single-precision + def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, +- [(set FPR32:$Rd, (fround FPR64:$Rn))]>; ++ [(set FPR32:$Rd, (fpround FPR64:$Rn))]>; + + // Half-precision to Double-precision + def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, +- [(set FPR64:$Rd, (fextend FPR16:$Rn))]>; ++ [(set FPR64:$Rd, (fpextend FPR16:$Rn))]>; + + // Half-precision to Single-precision + def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, +- [(set FPR32:$Rd, (fextend FPR16:$Rn))]>; ++ [(set FPR32:$Rd, (fpextend FPR16:$Rn))]>; + + // Single-precision to Double-precision + def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, +- [(set FPR64:$Rd, (fextend FPR32:$Rn))]>; ++ [(set FPR64:$Rd, (fpextend FPR32:$Rn))]>; + + // Single-precision to Half-precision + def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, +- [(set FPR16:$Rd, (fround FPR32:$Rn))]>; ++ [(set FPR16:$Rd, (fpround FPR32:$Rn))]>; + } + + //--- +Index: lib/Target/AArch64/AArch64InstrInfo.td +=================================================================== +--- a/lib/Target/AArch64/AArch64InstrInfo.td ++++ b/lib/Target/AArch64/AArch64InstrInfo.td +@@ -2545,8 +2545,8 @@ + defm : FPToIntegerPats; + defm : FPToIntegerPats; + defm : FPToIntegerPats; +-defm : FPToIntegerPats; +-defm : FPToIntegerPats; ++defm : FPToIntegerPats; ++defm : FPToIntegerPats; + + //===----------------------------------------------------------------------===// + // Scaled integer to floating point conversion instructions. +@@ -2582,7 +2582,7 @@ + defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>; + defm FMOV : SingleOperandFPData<0b0000, "fmov">; + defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; +-defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>; ++defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>; + defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; + defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; + defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>; +@@ -2788,13 +2788,13 @@ + def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), + (i64 4)))), + (FCVTLv8i16 V128:$Rn)>; +-def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; +-def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn), ++def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; ++def : Pat<(v2f64 (fpextend (v2f32 (extract_subvector (v4f32 V128:$Rn), + (i64 2))))), + (FCVTLv4i32 V128:$Rn)>; + +-def : Pat<(v4f32 (fextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; +-def : Pat<(v4f32 (fextend (v4f16 (extract_subvector (v8f16 V128:$Rn), ++def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; ++def : Pat<(v4f32 (fpextend (v4f16 (extract_subvector (v8f16 V128:$Rn), + (i64 4))))), + (FCVTLv8i16 V128:$Rn)>; + +@@ -2808,9 +2808,9 @@ + def : Pat<(concat_vectors V64:$Rd, + (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), + (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +-def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; +-def : Pat<(v4f16 (fround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; +-def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))), ++def : Pat<(v2f32 (fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; ++def : Pat<(v4f16 (fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; ++def : Pat<(concat_vectors V64:$Rd, (v2f32 (fpround (v2f64 V128:$Rn)))), + (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; + defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>; + defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>; +@@ -2833,7 +2833,7 @@ + + defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; + defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>; +-defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; ++defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>; + defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; + defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; + defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>; +Index: lib/Target/AMDGPU/SIInstructions.td +=================================================================== +--- a/lib/Target/AMDGPU/SIInstructions.td ++++ b/lib/Target/AMDGPU/SIInstructions.td +@@ -1107,10 +1107,10 @@ + VOP_I32_F32, cvt_flr_i32_f32>; + defm V_CVT_OFF_F32_I4 : VOP1Inst , "v_cvt_off_f32_i4", VOP_F32_I32>; + defm V_CVT_F32_F64 : VOP1Inst , "v_cvt_f32_f64", +- VOP_F32_F64, fround ++ VOP_F32_F64, fpround + >; + defm V_CVT_F64_F32 : VOP1Inst , "v_cvt_f64_f32", +- VOP_F64_F32, fextend ++ VOP_F64_F32, fpextend + >; + defm V_CVT_F32_UBYTE0 : VOP1Inst , "v_cvt_f32_ubyte0", + VOP_F32_I32, AMDGPUcvt_f32_ubyte0 +Index: lib/Target/ARM/ARMInstrVFP.td +=================================================================== +--- a/lib/Target/ARM/ARMInstrVFP.td ++++ b/lib/Target/ARM/ARMInstrVFP.td +@@ -624,7 +624,7 @@ + def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, + (outs DPR:$Dd), (ins SPR:$Sm), + IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", +- [(set DPR:$Dd, (fextend SPR:$Sm))]> { ++ [(set DPR:$Dd, (fpextend SPR:$Sm))]> { + // Instruction operands. + bits<5> Dd; + bits<5> Sm; +@@ -641,7 +641,7 @@ + // Special case encoding: bits 11-8 is 0b1011. + def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, + IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", +- [(set SPR:$Sd, (fround DPR:$Dm))]> { ++ [(set SPR:$Sd, (fpround DPR:$Dm))]> { + // Instruction operands. + bits<5> Sd; + bits<5> Dm; +@@ -838,7 +838,7 @@ + } + } + +-defm VCVTA : vcvt_inst<"a", 0b00, frnd>; ++defm VCVTA : vcvt_inst<"a", 0b00, fround>; + defm VCVTN : vcvt_inst<"n", 0b01>; + defm VCVTP : vcvt_inst<"p", 0b10, fceil>; + defm VCVTM : vcvt_inst<"m", 0b11, ffloor>; +@@ -938,7 +938,7 @@ + Requires<[HasFPARMv8,HasDPVFP]>; + } + +-defm VRINTA : vrint_inst_anpm<"a", 0b00, frnd>; ++defm VRINTA : vrint_inst_anpm<"a", 0b00, fround>; + defm VRINTN : vrint_inst_anpm<"n", 0b01>; + defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>; + defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>; +Index: lib/Target/Hexagon/HexagonISelLowering.cpp +=================================================================== +--- a/lib/Target/Hexagon/HexagonISelLowering.cpp ++++ b/lib/Target/Hexagon/HexagonISelLowering.cpp +@@ -1906,7 +1906,7 @@ + } + // Turn FP truncstore into trunc + store. + setTruncStoreAction(MVT::f64, MVT::f32, Expand); +- // Turn FP extload into load/fextend. ++ // Turn FP extload into load/fpextend. + for (MVT VT : MVT::fp_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + +Index: lib/Target/Hexagon/HexagonInstrInfoV5.td +=================================================================== +--- a/lib/Target/Hexagon/HexagonInstrInfoV5.td ++++ b/lib/Target/Hexagon/HexagonInstrInfoV5.td +@@ -564,10 +564,10 @@ + + // Convert single precision to double precision and vice-versa. + def F2_conv_sf2df : F2_RDD_RS_CONVERT <"convert_sf2df", 0b000, +- fextend, F64, F32>; ++ fpextend, F64, F32>; + + def F2_conv_df2sf : F2_RD_RSS_CONVERT <"convert_df2sf", 0b000, +- fround, F32, F64>; ++ fpround, F32, F64>; + + // Convert Integer to Floating Point. + def F2_conv_d2sf : F2_RD_RSS_CONVERT <"convert_d2sf", 0b010, +Index: lib/Target/Mips/MipsInstrFPU.td +=================================================================== +--- a/lib/Target/Mips/MipsInstrFPU.td ++++ b/lib/Target/Mips/MipsInstrFPU.td +@@ -635,9 +635,9 @@ + (PseudoCVT_D32_W GPR32Opnd:$src)>, FGR_32; + def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src), + (TRUNC_W_D32 AFGR64Opnd:$src)>, FGR_32; +-def : MipsPat<(f32 (fround AFGR64Opnd:$src)), ++def : MipsPat<(f32 (fpround AFGR64Opnd:$src)), + (CVT_S_D32 AFGR64Opnd:$src)>, FGR_32; +-def : MipsPat<(f64 (fextend FGR32Opnd:$src)), ++def : MipsPat<(f64 (fpextend FGR32Opnd:$src)), + (CVT_D32_S FGR32Opnd:$src)>, FGR_32; + + def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>, FGR_64; +@@ -657,9 +657,9 @@ + def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src), + (TRUNC_L_D64 FGR64Opnd:$src)>, FGR_64; + +-def : MipsPat<(f32 (fround FGR64Opnd:$src)), ++def : MipsPat<(f32 (fpround FGR64Opnd:$src)), + (CVT_S_D64 FGR64Opnd:$src)>, FGR_64; +-def : MipsPat<(f64 (fextend FGR32Opnd:$src)), ++def : MipsPat<(f64 (fpextend FGR32Opnd:$src)), + (CVT_D64_S FGR32Opnd:$src)>, FGR_64; + + // Patterns for loads/stores with a reg+imm operand. +Index: lib/Target/NVPTX/NVPTXISelLowering.cpp +=================================================================== +--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp ++++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp +@@ -206,7 +206,7 @@ + // intrinsics. + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + +- // Turn FP extload into load/fextend ++ // Turn FP extload into load/fpextend + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); +Index: lib/Target/NVPTX/NVPTXInstrInfo.td +=================================================================== +--- a/lib/Target/NVPTX/NVPTXInstrInfo.td ++++ b/lib/Target/NVPTX/NVPTXInstrInfo.td +@@ -2613,16 +2613,16 @@ + def : Pat<(ctpop Int16Regs:$a), + (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; + +-// fround f64 -> f32 +-def : Pat<(f32 (fround Float64Regs:$a)), ++// fpround f64 -> f32 ++def : Pat<(f32 (fpround Float64Regs:$a)), + (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; +-def : Pat<(f32 (fround Float64Regs:$a)), ++def : Pat<(f32 (fpround Float64Regs:$a)), + (CVT_f32_f64 Float64Regs:$a, CvtRN)>; + +-// fextend f32 -> f64 +-def : Pat<(f64 (fextend Float32Regs:$a)), ++// fpextend f32 -> f64 ++def : Pat<(f64 (fpextend Float32Regs:$a)), + (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +-def : Pat<(f64 (fextend Float32Regs:$a)), ++def : Pat<(f64 (fpextend Float32Regs:$a)), + (CVT_f64_f32 Float32Regs:$a, CvtNONE)>; + + def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone, +Index: lib/Target/PowerPC/PPCInstrInfo.td +=================================================================== +--- a/lib/Target/PowerPC/PPCInstrInfo.td ++++ b/lib/Target/PowerPC/PPCInstrInfo.td +@@ -2110,15 +2110,15 @@ + + defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB), + "frsp", "$frD, $frB", IIC_FPGeneral, +- [(set f32:$frD, (fround f64:$frB))]>; ++ [(set f32:$frD, (fpround f64:$frB))]>; + + let Interpretation64Bit = 1, isCodeGenOnly = 1 in + defm FRIND : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB), + "frin", "$frD, $frB", IIC_FPGeneral, +- [(set f64:$frD, (frnd f64:$frB))]>; ++ [(set f64:$frD, (fround f64:$frB))]>; + defm FRINS : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB), + "frin", "$frD, $frB", IIC_FPGeneral, +- [(set f32:$frD, (frnd f32:$frB))]>; ++ [(set f32:$frD, (fround f32:$frB))]>; + } + + let hasSideEffects = 0 in { +@@ -2856,7 +2856,7 @@ + def : Pat<(f64 (extloadf32 xaddr:$src)), + (COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>; + +-def : Pat<(f64 (fextend f32:$src)), ++def : Pat<(f64 (fpextend f32:$src)), + (COPY_TO_REGCLASS $src, F8RC)>; + + // Only seq_cst fences require the heavyweight sync (SYNC 0). +Index: lib/Target/PowerPC/PPCInstrQPX.td +=================================================================== +--- a/lib/Target/PowerPC/PPCInstrQPX.td ++++ b/lib/Target/PowerPC/PPCInstrQPX.td +@@ -88,11 +88,11 @@ + return cast(N)->getMemoryVT() == MVT::v4f32; + }]>; + +-def fround_inexact : PatFrag<(ops node:$val), (fround node:$val), [{ ++def fround_inexact : PatFrag<(ops node:$val), (fpround node:$val), [{ + return cast(N->getOperand(1))->getZExtValue() == 0; + }]>; + +-def fround_exact : PatFrag<(ops node:$val), (fround node:$val), [{ ++def fround_exact : PatFrag<(ops node:$val), (fpround node:$val), [{ + return cast(N->getOperand(1))->getZExtValue() == 1; + }]>; + +@@ -311,11 +311,11 @@ + + def QVFRIN : XForm_19<4, 392, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfrin $FRT, $FRB", IIC_FPGeneral, +- [(set v4f64:$FRT, (frnd v4f64:$FRB))]>; ++ [(set v4f64:$FRT, (fround v4f64:$FRB))]>; + let isCodeGenOnly = 1 in + def QVFRINs : XForm_19<4, 392, (outs qsrc:$FRT), (ins qsrc:$FRB), + "qvfrin $FRT, $FRB", IIC_FPGeneral, +- [(set v4f32:$FRT, (frnd v4f32:$FRB))]>; ++ [(set v4f32:$FRT, (fround v4f32:$FRB))]>; + + def QVFRIP : XForm_19<4, 456, (outs qfrc:$FRT), (ins qfrc:$FRB), + "qvfrip $FRT, $FRB", IIC_FPGeneral, +@@ -1103,7 +1103,7 @@ + def : Pat<(not v4i1:$FRA), + (QVFLOGICALb $FRA, $FRA, (i32 10))>; + +-def : Pat<(v4f64 (fextend v4f32:$src)), ++def : Pat<(v4f64 (fpextend v4f32:$src)), + (COPY_TO_REGCLASS $src, QFRC)>; + + def : Pat<(v4f32 (fround_exact v4f64:$src)), +Index: lib/Target/PowerPC/PPCInstrVSX.td +=================================================================== +--- a/lib/Target/PowerPC/PPCInstrVSX.td ++++ b/lib/Target/PowerPC/PPCInstrVSX.td +@@ -634,7 +634,7 @@ + def XSRDPI : XX2Form<60, 73, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpi $XT, $XB", IIC_VecFP, +- [(set f64:$XT, (frnd f64:$XB))]>; ++ [(set f64:$XT, (fround f64:$XB))]>; + def XSRDPIC : XX2Form<60, 107, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpic $XT, $XB", IIC_VecFP, +@@ -655,7 +655,7 @@ + def XVRDPI : XX2Form<60, 201, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrdpi $XT, $XB", IIC_VecFP, +- [(set v2f64:$XT, (frnd v2f64:$XB))]>; ++ [(set v2f64:$XT, (fround v2f64:$XB))]>; + def XVRDPIC : XX2Form<60, 235, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrdpic $XT, $XB", IIC_VecFP, +@@ -676,7 +676,7 @@ + def XVRSPI : XX2Form<60, 137, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspi $XT, $XB", IIC_VecFP, +- [(set v4f32:$XT, (frnd v4f32:$XB))]>; ++ [(set v4f32:$XT, (fround v4f32:$XB))]>; + def XVRSPIC : XX2Form<60, 171, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspic $XT, $XB", IIC_VecFP, +@@ -1108,7 +1108,7 @@ + + def : Pat<(f64 (extloadf32 xoaddr:$src)), + (COPY_TO_REGCLASS (LXSSPX xoaddr:$src), VSFRC)>; +- def : Pat<(f64 (fextend f32:$src)), ++ def : Pat<(f64 (fpextend f32:$src)), + (COPY_TO_REGCLASS $src, VSFRC)>; + + def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)), +Index: lib/Target/Sparc/SparcISelLowering.cpp +=================================================================== +--- a/lib/Target/Sparc/SparcISelLowering.cpp ++++ b/lib/Target/Sparc/SparcISelLowering.cpp +@@ -1508,7 +1508,7 @@ + // AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); + } + +- // Turn FP extload into load/fextend ++ // Turn FP extload into load/fpextend + for (MVT VT : MVT::fp_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); +Index: lib/Target/Sparc/SparcInstrInfo.td +=================================================================== +--- a/lib/Target/Sparc/SparcInstrInfo.td ++++ b/lib/Target/Sparc/SparcInstrInfo.td +@@ -1131,32 +1131,32 @@ + def FSTOD : F3_3u<2, 0b110100, 0b011001001, + (outs DFPRegs:$rd), (ins FPRegs:$rs2), + "fstod $rs2, $rd", +- [(set f64:$rd, (fextend f32:$rs2))], ++ [(set f64:$rd, (fpextend f32:$rs2))], + IIC_fpu_stod>; + def FSTOQ : F3_3u<2, 0b110100, 0b011001101, + (outs QFPRegs:$rd), (ins FPRegs:$rs2), + "fstoq $rs2, $rd", +- [(set f128:$rd, (fextend f32:$rs2))]>, ++ [(set f128:$rd, (fpextend f32:$rs2))]>, + Requires<[HasHardQuad]>; + def FDTOS : F3_3u<2, 0b110100, 0b011000110, + (outs FPRegs:$rd), (ins DFPRegs:$rs2), + "fdtos $rs2, $rd", +- [(set f32:$rd, (fround f64:$rs2))], ++ [(set f32:$rd, (fpround f64:$rs2))], + IIC_fpu_fast_instr>; + def FDTOQ : F3_3u<2, 0b110100, 0b011001110, + (outs QFPRegs:$rd), (ins DFPRegs:$rs2), + "fdtoq $rs2, $rd", +- [(set f128:$rd, (fextend f64:$rs2))]>, ++ [(set f128:$rd, (fpextend f64:$rs2))]>, + Requires<[HasHardQuad]>; + def FQTOS : F3_3u<2, 0b110100, 0b011000111, + (outs FPRegs:$rd), (ins QFPRegs:$rs2), + "fqtos $rs2, $rd", +- [(set f32:$rd, (fround f128:$rs2))]>, ++ [(set f32:$rd, (fpround f128:$rs2))]>, + Requires<[HasHardQuad]>; + def FQTOD : F3_3u<2, 0b110100, 0b011001011, + (outs DFPRegs:$rd), (ins QFPRegs:$rs2), + "fqtod $rs2, $rd", +- [(set f64:$rd, (fround f128:$rs2))]>, ++ [(set f64:$rd, (fpround f128:$rs2))]>, + Requires<[HasHardQuad]>; + + // Floating-point Move Instructions, p. 144 +@@ -1255,14 +1255,14 @@ + def FSMULD : F3_3<2, 0b110100, 0b001101001, + (outs DFPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2), + "fsmuld $rs1, $rs2, $rd", +- [(set f64:$rd, (fmul (fextend f32:$rs1), +- (fextend f32:$rs2)))], ++ [(set f64:$rd, (fmul (fpextend f32:$rs1), ++ (fpextend f32:$rs2)))], + IIC_fpu_muld>; + def FDMULQ : F3_3<2, 0b110100, 0b001101110, + (outs QFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), + "fdmulq $rs1, $rs2, $rd", +- [(set f128:$rd, (fmul (fextend f64:$rs1), +- (fextend f64:$rs2)))]>, ++ [(set f128:$rd, (fmul (fpextend f64:$rs1), ++ (fpextend f64:$rs2)))]>, + Requires<[HasHardQuad]>; + + // FDIVS generates an erratum on LEON processors, so by disabling this instruction +Index: lib/Target/SystemZ/SystemZISelLowering.cpp +=================================================================== +--- a/lib/Target/SystemZ/SystemZISelLowering.cpp ++++ b/lib/Target/SystemZ/SystemZISelLowering.cpp +@@ -4995,8 +4995,8 @@ + + SDValue SystemZTargetLowering::combineFP_ROUND( + SDNode *N, DAGCombinerInfo &DCI) const { +- // (fround (extract_vector_elt X 0)) +- // (fround (extract_vector_elt X 1)) -> ++ // (fpround (extract_vector_elt X 0)) ++ // (fpround (extract_vector_elt X 1)) -> + // (extract_vector_elt (VROUND X) 0) + // (extract_vector_elt (VROUND X) 1) + // +Index: lib/Target/SystemZ/SystemZInstrFP.td +=================================================================== +--- a/lib/Target/SystemZ/SystemZInstrFP.td ++++ b/lib/Target/SystemZ/SystemZInstrFP.td +@@ -154,7 +154,7 @@ + // Convert floating-point values to narrower representations, rounding + // according to the current mode. The destination of LEXBR and LDXBR + // is a 128-bit value, but only the first register of the pair is used. +-def LEDBR : UnaryRRE<"ledb", 0xB344, fround, FP32, FP64>; ++def LEDBR : UnaryRRE<"ledb", 0xB344, fpround, FP32, FP64>; + def LEXBR : UnaryRRE<"lexb", 0xB346, null_frag, FP128, FP128>; + def LDXBR : UnaryRRE<"ldxb", 0xB345, null_frag, FP128, FP128>; + +@@ -165,15 +165,15 @@ + def LDXBRA : UnaryRRF4<"ldxbra", 0xB345, FP128, FP128>, + Requires<[FeatureFPExtension]>; + +-def : Pat<(f32 (fround FP128:$src)), ++def : Pat<(f32 (fpround FP128:$src)), + (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>; +-def : Pat<(f64 (fround FP128:$src)), ++def : Pat<(f64 (fpround FP128:$src)), + (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>; + + // Extend register floating-point values to wider representations. +-def LDEBR : UnaryRRE<"ldeb", 0xB304, fextend, FP64, FP32>; +-def LXEBR : UnaryRRE<"lxeb", 0xB306, fextend, FP128, FP32>; +-def LXDBR : UnaryRRE<"lxdb", 0xB305, fextend, FP128, FP64>; ++def LDEBR : UnaryRRE<"ldeb", 0xB304, fpextend, FP64, FP32>; ++def LXEBR : UnaryRRE<"lxeb", 0xB306, fpextend, FP128, FP32>; ++def LXDBR : UnaryRRE<"lxdb", 0xB305, fpextend, FP128, FP64>; + + // Extend memory floating-point values to wider representations. + def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64, 4>; +@@ -347,9 +347,9 @@ + + // Same idea for round, where mode 1 is round towards nearest with + // ties away from zero. +- def : Pat<(frnd FP32:$src), (FIEBRA 1, FP32:$src, 4)>; +- def : Pat<(frnd FP64:$src), (FIDBRA 1, FP64:$src, 4)>; +- def : Pat<(frnd FP128:$src), (FIXBRA 1, FP128:$src, 4)>; ++ def : Pat<(fround FP32:$src), (FIEBRA 1, FP32:$src, 4)>; ++ def : Pat<(fround FP64:$src), (FIDBRA 1, FP64:$src, 4)>; ++ def : Pat<(fround FP128:$src), (FIXBRA 1, FP128:$src, 4)>; + } + + //===----------------------------------------------------------------------===// +@@ -388,26 +388,26 @@ + + // f64 multiplication of two FP32 registers. + def MDEBR : BinaryRRE<"mdeb", 0xB30C, null_frag, FP64, FP32>; +-def : Pat<(fmul (f64 (fextend FP32:$src1)), (f64 (fextend FP32:$src2))), ++def : Pat<(fmul (f64 (fpextend FP32:$src1)), (f64 (fpextend FP32:$src2))), + (MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + FP32:$src1, subreg_r32), FP32:$src2)>; + + // f64 multiplication of an FP32 register and an f32 memory. + def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>; +-def : Pat<(fmul (f64 (fextend FP32:$src1)), ++def : Pat<(fmul (f64 (fpextend FP32:$src1)), + (f64 (extloadf32 bdxaddr12only:$addr))), + (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_r32), + bdxaddr12only:$addr)>; + + // f128 multiplication of two FP64 registers. + def MXDBR : BinaryRRE<"mxdb", 0xB307, null_frag, FP128, FP64>; +-def : Pat<(fmul (f128 (fextend FP64:$src1)), (f128 (fextend FP64:$src2))), ++def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))), + (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)), + FP64:$src1, subreg_h64), FP64:$src2)>; + + // f128 multiplication of an FP64 register and an f64 memory. + def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>; +-def : Pat<(fmul (f128 (fextend FP64:$src1)), ++def : Pat<(fmul (f128 (fpextend FP64:$src1)), + (f128 (extloadf64 bdxaddr12only:$addr))), + (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64), + bdxaddr12only:$addr)>; +Index: lib/Target/SystemZ/SystemZInstrVector.td +=================================================================== +--- a/lib/Target/SystemZ/SystemZInstrVector.td ++++ b/lib/Target/SystemZ/SystemZInstrVector.td +@@ -798,7 +798,7 @@ + def : FPConversion; + def : FPConversion; + def : FPConversion; +- def : FPConversion; ++ def : FPConversion; + } + + let Predicates = [FeatureVector] in { +@@ -840,13 +840,13 @@ + + // Load lengthened. + def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>; +- def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fextend, v64db, v32eb, 2, 8>; ++ def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32eb, 2, 8>; + + // Load rounded, + def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>; + def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>; + def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>; +- def : FPConversion; ++ def : FPConversion; + + // Multiply. + def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>; +Index: lib/Target/WebAssembly/WebAssemblyInstrConv.td +=================================================================== +--- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td ++++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td +@@ -89,10 +89,10 @@ + "f64.convert_u/i64\t$dst, $src">; + + def F64_PROMOTE_F32 : I<(outs F64:$dst), (ins F32:$src), +- [(set F64:$dst, (fextend F32:$src))], ++ [(set F64:$dst, (fpextend F32:$src))], + "f64.promote/f32\t$dst, $src">; + def F32_DEMOTE_F64 : I<(outs F32:$dst), (ins F64:$src), +- [(set F32:$dst, (fround F64:$src))], ++ [(set F32:$dst, (fpround F64:$src))], + "f32.demote/f64\t$dst, $src">; + + def I32_REINTERPRET_F32 : I<(outs I32:$dst), (ins F32:$src), +Index: lib/Target/X86/X86InstrAVX512.td +=================================================================== +--- a/lib/Target/X86/X86InstrAVX512.td ++++ b/lib/Target/X86/X86InstrAVX512.td +@@ -5595,11 +5595,11 @@ + defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext, + X86fpextRnd,f32x_info, f64x_info >; + +-def : Pat<(f64 (fextend FR32X:$src)), ++def : Pat<(f64 (fpextend FR32X:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X), + (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>, + Requires<[HasAVX512]>; +-def : Pat<(f64 (fextend (loadf32 addr:$src))), ++def : Pat<(f64 (fpextend (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + Requires<[HasAVX512]>; + +@@ -5612,7 +5612,7 @@ + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>, + Requires<[HasAVX512, OptForSpeed]>; + +-def : Pat<(f32 (fround FR64X:$src)), ++def : Pat<(f32 (fpround FR64X:$src)), + (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), + (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, + Requires<[HasAVX512]>; +@@ -5666,29 +5666,29 @@ + // Extend Float to Double + multiclass avx512_cvtps2pd opc, string OpcodeStr> { + let Predicates = [HasAVX512] in { +- defm Z : avx512_vcvt_fp, ++ defm Z : avx512_vcvt_fp, + avx512_vcvt_fp_sae, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp, EVEX_V128; +- defm Z256 : avx512_vcvt_fp, ++ defm Z256 : avx512_vcvt_fp, + EVEX_V256; + } + } + + // Truncate Double to Float + multiclass avx512_cvtpd2ps opc, string OpcodeStr> { + let Predicates = [HasAVX512] in { +- defm Z : avx512_vcvt_fp, ++ defm Z : avx512_vcvt_fp, + avx512_vcvt_fp_rc, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp, EVEX_V128; +- defm Z256 : avx512_vcvt_fp, EVEX_V256; + } + } +@@ -6025,7 +6025,7 @@ + } + + let Predicates = [HasAVX512] in { +- def : Pat<(v8f32 (fround (loadv8f64 addr:$src))), ++ def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))), + (VCVTPD2PSZrm addr:$src)>; + def : Pat<(v8f64 (extloadv8f32 addr:$src)), + (VCVTPS2PDZrm addr:$src)>; +Index: lib/Target/X86/X86InstrFPStack.td +=================================================================== +--- a/lib/Target/X86/X86InstrFPStack.td ++++ b/lib/Target/X86/X86InstrFPStack.td +@@ -711,19 +711,19 @@ + + // FP extensions map onto simple pseudo-value conversions if they are to/from + // the FP stack. +-def : Pat<(f64 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>, ++def : Pat<(f64 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>, + Requires<[FPStackf32]>; +-def : Pat<(f80 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>, ++def : Pat<(f80 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>, + Requires<[FPStackf32]>; +-def : Pat<(f80 (fextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>, ++def : Pat<(f80 (fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>, + Requires<[FPStackf64]>; + + // FP truncations map onto simple pseudo-value conversions if they are to/from + // the FP stack. We have validated that only value-preserving truncations make + // it through isel. +-def : Pat<(f32 (fround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, ++def : Pat<(f32 (fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, + Requires<[FPStackf32]>; +-def : Pat<(f32 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, ++def : Pat<(f32 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, + Requires<[FPStackf32]>; +-def : Pat<(f64 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, ++def : Pat<(f64 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, + Requires<[FPStackf64]>; +Index: lib/Target/X86/X86InstrSSE.td +=================================================================== +--- a/lib/Target/X86/X86InstrSSE.td ++++ b/lib/Target/X86/X86InstrSSE.td +@@ -1799,16 +1799,16 @@ + Sched<[WriteCvtF2FLd, ReadAfterLd]>; + } + +-def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, ++def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, + Requires<[UseAVX]>; + + def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), + "cvtsd2ss\t{$src, $dst|$dst, $src}", +- [(set FR32:$dst, (fround FR64:$src))], ++ [(set FR32:$dst, (fpround FR64:$src))], + IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>; + def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), + "cvtsd2ss\t{$src, $dst|$dst, $src}", +- [(set FR32:$dst, (fround (loadf64 addr:$src)))], ++ [(set FR32:$dst, (fpround (loadf64 addr:$src)))], + IIC_SSE_CVT_Scalar_RM>, + XD, + Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; +@@ -1865,9 +1865,9 @@ + Sched<[WriteCvtF2FLd, ReadAfterLd]>; + } + +-def : Pat<(f64 (fextend FR32:$src)), ++def : Pat<(f64 (fpextend FR32:$src)), + (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>; +-def : Pat<(fextend (loadf32 addr:$src)), ++def : Pat<(fpextend (loadf32 addr:$src)), + (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>; + + def : Pat<(extloadf32 addr:$src), +@@ -1879,21 +1879,21 @@ + + def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), + "cvtss2sd\t{$src, $dst|$dst, $src}", +- [(set FR64:$dst, (fextend FR32:$src))], ++ [(set FR64:$dst, (fpextend FR32:$src))], + IIC_SSE_CVT_Scalar_RR>, XS, + Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; + def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), + "cvtss2sd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (extloadf32 addr:$src))], + IIC_SSE_CVT_Scalar_RM>, XS, + Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; + +-// extload f32 -> f64. This matches load+fextend because we have a hack in ++// extload f32 -> f64. This matches load+fpextend because we have a hack in + // the isel (PreprocessForFPConvert) that can introduce loads after dag + // combine. +-// Since these loads aren't folded into the fextend, we have to match it ++// Since these loads aren't folded into the fpextend, we have to match it + // explicitly here. +-def : Pat<(fextend (loadf32 addr:$src)), ++def : Pat<(fpextend (loadf32 addr:$src)), + (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>; + def : Pat<(extloadf32 addr:$src), + (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>; +@@ -2269,26 +2269,26 @@ + } + + let Predicates = [HasAVX, NoVLX] in { +- // Match fround and fextend for 128/256-bit conversions ++ // Match fpround and fpextend for 128/256-bit conversions + def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), + (VCVTPD2PSrr VR128:$src)>; + def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))), + (VCVTPD2PSXrm addr:$src)>; +- def : Pat<(v4f32 (fround (v4f64 VR256:$src))), ++ def : Pat<(v4f32 (fpround (v4f64 VR256:$src))), + (VCVTPD2PSYrr VR256:$src)>; +- def : Pat<(v4f32 (fround (loadv4f64 addr:$src))), ++ def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))), + (VCVTPD2PSYrm addr:$src)>; + + def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), + (VCVTPS2PDrr VR128:$src)>; +- def : Pat<(v4f64 (fextend (v4f32 VR128:$src))), ++ def : Pat<(v4f64 (fpextend (v4f32 VR128:$src))), + (VCVTPS2PDYrr VR128:$src)>; + def : Pat<(v4f64 (extloadv4f32 addr:$src)), + (VCVTPS2PDYrm addr:$src)>; + } + + let Predicates = [UseSSE2] in { +- // Match fround and fextend for 128 conversions ++ // Match fpround and fpextend for 128 conversions + def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), + (CVTPD2PSrr VR128:$src)>; + def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))), diff --git a/deps/patches/llvm-D24300_ptx_intrinsics.patch b/deps/patches/llvm-D24300_ptx_intrinsics.patch new file mode 100644 index 0000000000000..e0c1e5a286c56 --- /dev/null +++ b/deps/patches/llvm-D24300_ptx_intrinsics.patch @@ -0,0 +1,506 @@ +Index: lib/Target/NVPTX/NVPTXISelLowering.cpp +=================================================================== +--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp ++++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp +@@ -279,6 +279,28 @@ + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SELECT); + ++ // Library functions. These default to Expand, but we have instructions ++ // for them. ++ setOperationAction(ISD::FCEIL, MVT::f32, Legal); ++ setOperationAction(ISD::FCEIL, MVT::f64, Legal); ++ setOperationAction(ISD::FFLOOR, MVT::f32, Legal); ++ setOperationAction(ISD::FFLOOR, MVT::f64, Legal); ++ setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); ++ setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); ++ setOperationAction(ISD::FRINT, MVT::f32, Legal); ++ setOperationAction(ISD::FRINT, MVT::f64, Legal); ++ setOperationAction(ISD::FROUND, MVT::f32, Legal); ++ setOperationAction(ISD::FROUND, MVT::f64, Legal); ++ setOperationAction(ISD::FTRUNC, MVT::f32, Legal); ++ setOperationAction(ISD::FTRUNC, MVT::f64, Legal); ++ setOperationAction(ISD::FMINNUM, MVT::f32, Legal); ++ setOperationAction(ISD::FMINNUM, MVT::f64, Legal); ++ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); ++ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); ++ ++ // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. ++ // No FPOW or FREM in PTX. ++ + // Now deduce the information based on the above mentioned + // actions + computeRegisterProperties(STI.getRegisterInfo()); +Index: lib/Target/NVPTX/NVPTXInstrInfo.td +=================================================================== +--- a/lib/Target/NVPTX/NVPTXInstrInfo.td ++++ b/lib/Target/NVPTX/NVPTXInstrInfo.td +@@ -207,15 +207,63 @@ + } + + // Template for instructions which take three fp64 or fp32 args. The +-// instructions are named ".f" (e.g. "add.f64"). ++// instructions are named ".f" (e.g. "min.f64"). + // + // Also defines ftz (flush subnormal inputs and results to sign-preserving + // zero) variants for fp32 functions. ++// ++// This multiclass should be used for nodes that cannot be folded into FMAs. ++// For nodes that can be folded into FMAs (i.e. adds and muls), use ++// F3_fma_component. + multiclass F3 { + def f64rr : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), ++ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>; ++ def f64ri : ++ NVPTXInst<(outs Float64Regs:$dst), ++ (ins Float64Regs:$a, f64imm:$b), ++ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), ++ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>; ++ def f32rr_ftz : ++ NVPTXInst<(outs Float32Regs:$dst), ++ (ins Float32Regs:$a, Float32Regs:$b), ++ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), ++ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, ++ Requires<[doF32FTZ]>; ++ def f32ri_ftz : ++ NVPTXInst<(outs Float32Regs:$dst), ++ (ins Float32Regs:$a, f32imm:$b), ++ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), ++ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, ++ Requires<[doF32FTZ]>; ++ def f32rr : ++ NVPTXInst<(outs Float32Regs:$dst), ++ (ins Float32Regs:$a, Float32Regs:$b), ++ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), ++ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>; ++ def f32ri : ++ NVPTXInst<(outs Float32Regs:$dst), ++ (ins Float32Regs:$a, f32imm:$b), ++ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), ++ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>; ++} ++ ++// Template for instructions which take three fp64 or fp32 args. The ++// instructions are named ".f" (e.g. "add.f64"). ++// ++// Also defines ftz (flush subnormal inputs and results to sign-preserving ++// zero) variants for fp32 functions. ++// ++// This multiclass should be used for nodes that can be folded to make fma ops. ++// In this case, we use the ".rn" variant when FMA is disabled, as this behaves ++// just like the non ".rn" op, but prevents ptxas from creating FMAs. ++multiclass F3_fma_component { ++ def f64rr : ++ NVPTXInst<(outs Float64Regs:$dst), ++ (ins Float64Regs:$a, Float64Regs:$b), ++ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, + Requires<[allowFMA]>; + def f64ri : +@@ -248,41 +296,39 @@ + !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[allowFMA]>; +-} + +-// Same as F3, but defines ".rn" variants (round to nearest even). +-multiclass F3_rn { +- def f64rr : ++ // These have strange names so we don't perturb existing mir tests. ++ def _rnf64rr : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, + Requires<[noFMA]>; +- def f64ri : ++ def _rnf64ri : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>, + Requires<[noFMA]>; +- def f32rr_ftz : ++ def _rnf32rr_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[noFMA, doF32FTZ]>; +- def f32ri_ftz : ++ def _rnf32ri_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[noFMA, doF32FTZ]>; +- def f32rr : ++ def _rnf32rr : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[noFMA]>; +- def f32ri : ++ def _rnf32ri : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), +@@ -713,13 +759,12 @@ + N->getValueAPF().convertToDouble() == 1.0; + }]>; + +-defm FADD : F3<"add", fadd>; +-defm FSUB : F3<"sub", fsub>; +-defm FMUL : F3<"mul", fmul>; +- +-defm FADD_rn : F3_rn<"add", fadd>; +-defm FSUB_rn : F3_rn<"sub", fsub>; +-defm FMUL_rn : F3_rn<"mul", fmul>; ++defm FADD : F3_fma_component<"add", fadd>; ++defm FSUB : F3_fma_component<"sub", fsub>; ++defm FMUL : F3_fma_component<"mul", fmul>; ++ ++defm FMIN : F3<"min", fminnum>; ++defm FMAX : F3<"max", fmaxnum>; + + defm FABS : F2<"abs", fabs>; + defm FNEG : F2<"neg", fneg>; +@@ -2628,6 +2673,55 @@ + def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; + ++// fceil, ffloor, fround, ftrunc. ++ ++def : Pat<(fceil Float32Regs:$a), ++ (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>; ++def : Pat<(fceil Float32Regs:$a), ++ (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>; ++def : Pat<(fceil Float64Regs:$a), ++ (CVT_f64_f64 Float64Regs:$a, CvtRPI)>; ++ ++def : Pat<(ffloor Float32Regs:$a), ++ (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>; ++def : Pat<(ffloor Float32Regs:$a), ++ (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>; ++def : Pat<(ffloor Float64Regs:$a), ++ (CVT_f64_f64 Float64Regs:$a, CvtRMI)>; ++ ++def : Pat<(fround Float32Regs:$a), ++ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; ++def : Pat<(f32 (fround Float32Regs:$a)), ++ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; ++def : Pat<(f64 (fround Float64Regs:$a)), ++ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>; ++ ++def : Pat<(ftrunc Float32Regs:$a), ++ (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; ++def : Pat<(ftrunc Float32Regs:$a), ++ (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>; ++def : Pat<(ftrunc Float64Regs:$a), ++ (CVT_f64_f64 Float64Regs:$a, CvtRZI)>; ++ ++// nearbyint and rint are implemented as rounding to nearest even. This isn't ++// strictly correct, because it causes us to ignore the rounding mode. But it ++// matches what CUDA's "libm" does. ++ ++def : Pat<(fnearbyint Float32Regs:$a), ++ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; ++def : Pat<(fnearbyint Float32Regs:$a), ++ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; ++def : Pat<(fnearbyint Float64Regs:$a), ++ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>; ++ ++def : Pat<(frint Float32Regs:$a), ++ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; ++def : Pat<(frint Float32Regs:$a), ++ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; ++def : Pat<(frint Float64Regs:$a), ++ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>; ++ ++ + //----------------------------------- + // Control-flow + //----------------------------------- +Index: test/CodeGen/NVPTX/bug22322.ll +=================================================================== +--- a/test/CodeGen/NVPTX/bug22322.ll ++++ b/test/CodeGen/NVPTX/bug22322.ll +@@ -22,7 +22,7 @@ + %8 = icmp eq i32 %7, 0 + %9 = select i1 %8, float 0.000000e+00, float -1.000000e+00 + store float %9, float* %ret_vec.sroa.8.i, align 4 +-; CHECK: setp.lt.f32 %p{{[0-9]+}}, %f{{[0-9]+}}, 0f00000000 ++; CHECK: max.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, 0f00000000 + %10 = fcmp olt float %9, 0.000000e+00 + %ret_vec.sroa.8.i.val = load float, float* %ret_vec.sroa.8.i, align 4 + %11 = select i1 %10, float 0.000000e+00, float %ret_vec.sroa.8.i.val +Index: test/CodeGen/NVPTX/math-intrins.ll +=================================================================== +--- a/test/CodeGen/NVPTX/math-intrins.ll ++++ b/test/CodeGen/NVPTX/math-intrins.ll +@@ -0,0 +1,261 @@ ++; RUN: llc < %s | FileCheck %s ++target triple = "nvptx64-nvidia-cuda" ++ ++; Checks that llvm intrinsics for math functions are correctly lowered to PTX. ++ ++declare float @llvm.ceil.f32(float) #0 ++declare double @llvm.ceil.f64(double) #0 ++declare float @llvm.floor.f32(float) #0 ++declare double @llvm.floor.f64(double) #0 ++declare float @llvm.round.f32(float) #0 ++declare double @llvm.round.f64(double) #0 ++declare float @llvm.nearbyint.f32(float) #0 ++declare double @llvm.nearbyint.f64(double) #0 ++declare float @llvm.rint.f32(float) #0 ++declare double @llvm.rint.f64(double) #0 ++declare float @llvm.trunc.f32(float) #0 ++declare double @llvm.trunc.f64(double) #0 ++declare float @llvm.fabs.f32(float) #0 ++declare double @llvm.fabs.f64(double) #0 ++declare float @llvm.minnum.f32(float, float) #0 ++declare double @llvm.minnum.f64(double, double) #0 ++declare float @llvm.maxnum.f32(float, float) #0 ++declare double @llvm.maxnum.f64(double, double) #0 ++ ++; ---- ceil ---- ++ ++; CHECK-LABEL: ceil_float ++define float @ceil_float(float %a) { ++ ; CHECK: cvt.rpi.f32.f32 ++ %b = call float @llvm.ceil.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: ceil_float_ftz ++define float @ceil_float_ftz(float %a) #1 { ++ ; CHECK: cvt.rpi.ftz.f32.f32 ++ %b = call float @llvm.ceil.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: ceil_double ++define double @ceil_double(double %a) { ++ ; CHECK: cvt.rpi.f64.f64 ++ %b = call double @llvm.ceil.f64(double %a) ++ ret double %b ++} ++ ++; ---- floor ---- ++ ++; CHECK-LABEL: floor_float ++define float @floor_float(float %a) { ++ ; CHECK: cvt.rmi.f32.f32 ++ %b = call float @llvm.floor.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: floor_float_ftz ++define float @floor_float_ftz(float %a) #1 { ++ ; CHECK: cvt.rmi.ftz.f32.f32 ++ %b = call float @llvm.floor.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: floor_double ++define double @floor_double(double %a) { ++ ; CHECK: cvt.rmi.f64.f64 ++ %b = call double @llvm.floor.f64(double %a) ++ ret double %b ++} ++ ++; ---- round ---- ++ ++; CHECK-LABEL: round_float ++define float @round_float(float %a) { ++ ; CHECK: cvt.rni.f32.f32 ++ %b = call float @llvm.round.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: round_float_ftz ++define float @round_float_ftz(float %a) #1 { ++ ; CHECK: cvt.rni.ftz.f32.f32 ++ %b = call float @llvm.round.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: round_double ++define double @round_double(double %a) { ++ ; CHECK: cvt.rni.f64.f64 ++ %b = call double @llvm.round.f64(double %a) ++ ret double %b ++} ++ ++; ---- nearbyint ---- ++ ++; CHECK-LABEL: nearbyint_float ++define float @nearbyint_float(float %a) { ++ ; CHECK: cvt.rni.f32.f32 ++ %b = call float @llvm.nearbyint.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: nearbyint_float_ftz ++define float @nearbyint_float_ftz(float %a) #1 { ++ ; CHECK: cvt.rni.ftz.f32.f32 ++ %b = call float @llvm.nearbyint.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: nearbyint_double ++define double @nearbyint_double(double %a) { ++ ; CHECK: cvt.rni.f64.f64 ++ %b = call double @llvm.nearbyint.f64(double %a) ++ ret double %b ++} ++ ++; ---- rint ---- ++ ++; CHECK-LABEL: rint_float ++define float @rint_float(float %a) { ++ ; CHECK: cvt.rni.f32.f32 ++ %b = call float @llvm.rint.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: rint_float_ftz ++define float @rint_float_ftz(float %a) #1 { ++ ; CHECK: cvt.rni.ftz.f32.f32 ++ %b = call float @llvm.rint.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: rint_double ++define double @rint_double(double %a) { ++ ; CHECK: cvt.rni.f64.f64 ++ %b = call double @llvm.rint.f64(double %a) ++ ret double %b ++} ++ ++; ---- trunc ---- ++ ++; CHECK-LABEL: trunc_float ++define float @trunc_float(float %a) { ++ ; CHECK: cvt.rzi.f32.f32 ++ %b = call float @llvm.trunc.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: trunc_float_ftz ++define float @trunc_float_ftz(float %a) #1 { ++ ; CHECK: cvt.rzi.ftz.f32.f32 ++ %b = call float @llvm.trunc.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: trunc_double ++define double @trunc_double(double %a) { ++ ; CHECK: cvt.rzi.f64.f64 ++ %b = call double @llvm.trunc.f64(double %a) ++ ret double %b ++} ++ ++; ---- abs ---- ++ ++; CHECK-LABEL: abs_float ++define float @abs_float(float %a) { ++ ; CHECK: abs.f32 ++ %b = call float @llvm.fabs.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: abs_float_ftz ++define float @abs_float_ftz(float %a) #1 { ++ ; CHECK: abs.ftz.f32 ++ %b = call float @llvm.fabs.f32(float %a) ++ ret float %b ++} ++ ++; CHECK-LABEL: abs_double ++define double @abs_double(double %a) { ++ ; CHECK: abs.f64 ++ %b = call double @llvm.fabs.f64(double %a) ++ ret double %b ++} ++ ++; ---- min ---- ++ ++; CHECK-LABEL: min_float ++define float @min_float(float %a, float %b) { ++ ; CHECK: min.f32 ++ %x = call float @llvm.minnum.f32(float %a, float %b) ++ ret float %x ++} ++ ++; CHECK-LABEL: min_imm1 ++define float @min_imm1(float %a) { ++ ; CHECK: min.f32 ++ %x = call float @llvm.minnum.f32(float %a, float 0.0) ++ ret float %x ++} ++ ++; CHECK-LABEL: min_imm2 ++define float @min_imm2(float %a) { ++ ; CHECK: min.f32 ++ %x = call float @llvm.minnum.f32(float 0.0, float %a) ++ ret float %x ++} ++ ++; CHECK-LABEL: min_float_ftz ++define float @min_float_ftz(float %a, float %b) #1 { ++ ; CHECK: min.ftz.f32 ++ %x = call float @llvm.minnum.f32(float %a, float %b) ++ ret float %x ++} ++ ++; CHECK-LABEL: min_double ++define double @min_double(double %a, double %b) { ++ ; CHECK: min.f64 ++ %x = call double @llvm.minnum.f64(double %a, double %b) ++ ret double %x ++} ++ ++; ---- max ---- ++ ++; CHECK-LABEL: max_imm1 ++define float @max_imm1(float %a) { ++ ; CHECK: max.f32 ++ %x = call float @llvm.maxnum.f32(float %a, float 0.0) ++ ret float %x ++} ++ ++; CHECK-LABEL: max_imm2 ++define float @max_imm2(float %a) { ++ ; CHECK: max.f32 ++ %x = call float @llvm.maxnum.f32(float 0.0, float %a) ++ ret float %x ++} ++ ++; CHECK-LABEL: max_float ++define float @max_float(float %a, float %b) { ++ ; CHECK: max.f32 ++ %x = call float @llvm.maxnum.f32(float %a, float %b) ++ ret float %x ++} ++ ++; CHECK-LABEL: max_float_ftz ++define float @max_float_ftz(float %a, float %b) #1 { ++ ; CHECK: max.ftz.f32 ++ %x = call float @llvm.maxnum.f32(float %a, float %b) ++ ret float %x ++} ++ ++; CHECK-LABEL: max_double ++define double @max_double(double %a, double %b) { ++ ; CHECK: max.f64 ++ %x = call double @llvm.maxnum.f64(double %a, double %b) ++ ret double %x ++} ++ ++attributes #0 = { nounwind readnone } ++attributes #1 = { "nvptx-f32ftz" = "true" } diff --git a/deps/patches/llvm-D9168_argument_alignment.patch b/deps/patches/llvm-D9168_argument_alignment.patch new file mode 100644 index 0000000000000..8166cc379f419 --- /dev/null +++ b/deps/patches/llvm-D9168_argument_alignment.patch @@ -0,0 +1,98 @@ +Index: lib/Target/NVPTX/NVPTXISelLowering.h +=================================================================== +--- a/lib/Target/NVPTX/NVPTXISelLowering.h ++++ b/lib/Target/NVPTX/NVPTXISelLowering.h +@@ -539,7 +539,8 @@ + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + + unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS, +- Type *Ty, unsigned Idx) const; ++ Type *Ty, unsigned Idx, ++ const DataLayout &DL) const; + }; + } // namespace llvm + +Index: lib/Target/NVPTX/NVPTXISelLowering.cpp +=================================================================== +--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp ++++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp +@@ -1024,11 +1024,15 @@ + return O.str(); + } + +-unsigned +-NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, +- const ImmutableCallSite *CS, +- Type *Ty, +- unsigned Idx) const { ++unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, ++ const ImmutableCallSite *CS, ++ Type *Ty, unsigned Idx, ++ const DataLayout &DL) const { ++ if (!CS) { ++ // CallSite is zero, fallback to ABI type alignment ++ return DL.getABITypeAlignment(Ty); ++ } ++ + unsigned Align = 0; + const Value *DirectCallee = CS->getCalledFunction(); + +@@ -1046,7 +1050,7 @@ + + const Value *CalleeV = cast(CalleeI)->getCalledValue(); + // Ignore any bitcast instructions +- while(isa(CalleeV)) { ++ while (isa(CalleeV)) { + const ConstantExpr *CE = cast(CalleeV); + if (!CE->isCast()) + break; +@@ -1069,7 +1073,6 @@ + + // Call is indirect or alignment information is not available, fall back to + // the ABI type alignment +- auto &DL = CS->getCaller()->getParent()->getDataLayout(); + return DL.getABITypeAlignment(Ty); + } + +@@ -1126,7 +1129,8 @@ + ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets, + 0); + +- unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1); ++ unsigned align = ++ getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL); + // declare .param .align .b8 .param[]; + unsigned sz = DL.getTypeAllocSize(Ty); + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); +@@ -1166,7 +1170,8 @@ + } + if (Ty->isVectorTy()) { + EVT ObjectVT = getValueType(DL, Ty); +- unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1); ++ unsigned align = ++ getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL); + // declare .param .align .b8 .param[]; + unsigned sz = DL.getTypeAllocSize(Ty); + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); +@@ -1426,7 +1431,7 @@ + DeclareRetOps); + InFlag = Chain.getValue(1); + } else { +- retAlignment = getArgumentAlignment(Callee, CS, retTy, 0); ++ retAlignment = getArgumentAlignment(Callee, CS, retTy, 0, DL); + SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareRetOps[] = { Chain, + DAG.getConstant(retAlignment, dl, MVT::i32), +@@ -1633,9 +1638,10 @@ + } else { + SmallVector VTs; + SmallVector Offsets; +- ComputePTXValueVTs(*this, DAG.getDataLayout(), retTy, VTs, &Offsets, 0); ++ auto &DL = DAG.getDataLayout(); ++ ComputePTXValueVTs(*this, DL, retTy, VTs, &Offsets, 0); + assert(VTs.size() == Ins.size() && "Bad value decomposition"); +- unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0); ++ unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0, DL); + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + unsigned sz = VTs[i].getSizeInBits(); + unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);