Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CUDA] Add a pseudo GPU sm_next which allows overriding for SM/PTX version. #100247

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions clang/include/clang/Basic/Cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#ifndef LLVM_CLANG_BASIC_CUDA_H
#define LLVM_CLANG_BASIC_CUDA_H

#include "llvm/ADT/StringRef.h"
namespace llvm {
class StringRef;
class Twine;
Expand Down Expand Up @@ -52,6 +53,42 @@ const char *CudaVersionToString(CudaVersion V);
// Input is "Major.Minor"
CudaVersion CudaStringToVersion(const llvm::Twine &S);

enum class PTXVersion {
PTX_UNKNOWN = 0,
PTX_32 = 32,
PTX_40 = 40,
PTX_41,
PTX_42,
PTX_43,
PTX_50 = 50,
PTX_60 = 60,
PTX_61,
PTX_62,
PTX_63,
PTX_64,
PTX_65,
PTX_70 = 70,
PTX_71,
PTX_72,
PTX_73,
PTX_74,
PTX_75,
PTX_76,
PTX_77,
PTX_78,
PTX_80 = 80,
PTX_81,
PTX_82,
PTX_83,
PTX_84,
PTX_85,
PTX_LAST = PTX_85,
PTX_custom = 9999, // placeholder for an unknown future version.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we use UINT32_MAX or something?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've just used the same value I use in NVPTX.td on LLVM side, where I can't use UINT_MAX, though it's just for consistency. In case someone searches for 9999, they will find all related occurrences.

Specific values do not matter in either place, as long as they are distinct from other values.

};

/// Returns the feature string "+ptx<N>" for \p V, where N is the numeric value
/// of the enumerator. Only versions in the range (PTX_UNKNOWN, PTX_LAST] are
/// formatted; any other value (including PTX_custom) yields an empty string.
const std::string PTXVersionToFeature(PTXVersion V);
/// Returns the PTX version paired with CUDA version \p V in the CUDA version
/// table, or PTX_UNKNOWN when \p V has no entry in that table.
PTXVersion GetRequiredPTXVersion(CudaVersion V);

enum class OffloadArch {
UNUSED,
UNKNOWN,
Expand All @@ -78,6 +115,7 @@ enum class OffloadArch {
SM_89,
SM_90,
SM_90a,
SM_custom,
GFX600,
GFX601,
GFX602,
Expand Down Expand Up @@ -160,6 +198,12 @@ const char *OffloadArchToVirtualArchString(OffloadArch A);
// The input should have the form "sm_20".
OffloadArch StringToOffloadArch(llvm::StringRef S);

// Converts custom SM name to its numeric value to be used in __CUDA_ARCH__
// Custom SM name format: `sm_[ID][suffix]`.
// The function returns `ID`*10 or zero on error.
// `suffix` is expected to be empty or `a`. A trailing `a` is stripped before
// `ID` is parsed; any other suffix makes parsing fail and the function
// returns zero.
unsigned CUDACustomSMToArchID(llvm::StringRef S);

/// Get the earliest CudaVersion that supports the given OffloadArch.
CudaVersion MinVersionForOffloadArch(OffloadArch A);

Expand Down
2 changes: 2 additions & 0 deletions clang/include/clang/Basic/DiagnosticDriverKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,8 @@ def err_drv_invalid_or_unsupported_offload_target : Error<
"invalid or unsupported offload target: '%0'">;
def err_drv_cuda_offload_only_emit_bc : Error<
"CUDA offload target is supported only along with --emit-llvm">;
// The option names in this message must match the driver spellings of
// cuda_custom_sm_EQ and cuda_custom_ptx_EQ (`--cuda-custom-sm=` and
// `--cuda-custom-ptx=`).
def err_drv_sm_custom_args : Error<
  "offload target sm_custom requires both --cuda-custom-sm and --cuda-custom-ptx to be specified">;

def warn_drv_jmc_requires_debuginfo : Warning<
"%0 requires debug info. Use %1 or debug options that enable debugger's "
Expand Down
4 changes: 4 additions & 0 deletions clang/include/clang/Basic/LangOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,10 @@ class LangOptions : public LangOptionsBase {
// WebAssembly target.
bool NoWasmOpt = false;

// Overrides for the custom SM/PTX variants for CUDA's sm_custom target.
std::string CUDACustomSM;
unsigned CUDACustomPTX = 0;

LangOptions();

/// Set language defaults for the given input language and
Expand Down
11 changes: 11 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -1464,6 +1464,17 @@ def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
HelpText<"Do not override toolchain to compile HIP source to relocatable">;
}

// Help-hidden option, visible to both the clang driver and cc1, whose value is
// stored as a string in LangOptions::CUDACustomSM.
def cuda_custom_sm_EQ : Joined<["--"], "cuda-custom-sm=">,
Visibility<[ClangOption, CC1Option]>,
HelpText<"SM version to use for sm_custom GPU">,
MarshallingInfoString<LangOpts<"CUDACustomSM">>,
ShouldParseIf<cuda.KeyPath>,Flags<[HelpHidden]>;
// Help-hidden option, visible to both the clang driver and cc1, whose integer
// value is stored in LangOptions::CUDACustomPTX (default "0").
def cuda_custom_ptx_EQ : Joined<["--"], "cuda-custom-ptx=">,
  Visibility<[ClangOption, CC1Option]>,
  HelpText<"PTX version to use for sm_custom GPU">,
  MarshallingInfoInt<LangOpts<"CUDACustomPTX">, "0">,
  ShouldParseIf<cuda.KeyPath>,Flags<[HelpHidden]>;

// Clang specific/exclusive options for OpenACC.
def openacc_macro_override
: Separate<["-"], "fexperimental-openacc-macro-override">,
Expand Down
7 changes: 7 additions & 0 deletions clang/include/clang/Driver/ToolChain.h
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,13 @@ class ToolChain {
virtual void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args,
Action::OffloadKind DeviceOffloadKind) const;
/// [optional] Hook for toolchains that need the whole JobAction rather than
/// only its offload kind. It is meant to supplement the OffloadKind-based
/// overload above; this default implementation simply forwards to that
/// overload using the offloading device kind reported by \p JC.
virtual void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                                   llvm::opt::ArgStringList &CC1Args,
                                   const JobAction &JC) const {
  const auto DeviceOffloadKind = JC.getOffloadingDeviceKind();
  addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadKind);
}

/// Add options that need to be passed to cc1as for this target.
virtual void
Expand Down
94 changes: 62 additions & 32 deletions clang/lib/Basic/Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/VersionTuple.h"

namespace clang {
Expand All @@ -11,41 +12,41 @@ struct CudaVersionMapEntry {
const char *Name;
CudaVersion Version;
llvm::VersionTuple TVersion;
PTXVersion PTX;
};
#define CUDA_ENTRY(major, minor) \
{ \
#major "." #minor, CudaVersion::CUDA_##major##minor, \
llvm::VersionTuple(major, minor) \
}
#define CUDA_ENTRY(major, minor, ptx) \
{#major "." #minor, CudaVersion::CUDA_##major##minor, \
llvm::VersionTuple(major, minor), PTXVersion::ptx}

static const CudaVersionMapEntry CudaNameVersionMap[] = {
CUDA_ENTRY(7, 0),
CUDA_ENTRY(7, 5),
CUDA_ENTRY(8, 0),
CUDA_ENTRY(9, 0),
CUDA_ENTRY(9, 1),
CUDA_ENTRY(9, 2),
CUDA_ENTRY(10, 0),
CUDA_ENTRY(10, 1),
CUDA_ENTRY(10, 2),
CUDA_ENTRY(11, 0),
CUDA_ENTRY(11, 1),
CUDA_ENTRY(11, 2),
CUDA_ENTRY(11, 3),
CUDA_ENTRY(11, 4),
CUDA_ENTRY(11, 5),
CUDA_ENTRY(11, 6),
CUDA_ENTRY(11, 7),
CUDA_ENTRY(11, 8),
CUDA_ENTRY(12, 0),
CUDA_ENTRY(12, 1),
CUDA_ENTRY(12, 2),
CUDA_ENTRY(12, 3),
CUDA_ENTRY(12, 4),
CUDA_ENTRY(12, 5),
{"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max())},
{"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
};
CUDA_ENTRY(7, 0, PTX_42),
CUDA_ENTRY(7, 5, PTX_43),
CUDA_ENTRY(8, 0, PTX_50),
CUDA_ENTRY(9, 0, PTX_60),
CUDA_ENTRY(9, 1, PTX_61),
CUDA_ENTRY(9, 2, PTX_62),
CUDA_ENTRY(10, 0, PTX_63),
CUDA_ENTRY(10, 1, PTX_64),
CUDA_ENTRY(10, 2, PTX_65),
CUDA_ENTRY(11, 0, PTX_70),
CUDA_ENTRY(11, 1, PTX_71),
CUDA_ENTRY(11, 2, PTX_72),
CUDA_ENTRY(11, 3, PTX_73),
CUDA_ENTRY(11, 4, PTX_74),
CUDA_ENTRY(11, 5, PTX_75),
CUDA_ENTRY(11, 6, PTX_76),
CUDA_ENTRY(11, 7, PTX_77),
CUDA_ENTRY(11, 8, PTX_78),
CUDA_ENTRY(12, 0, PTX_80),
CUDA_ENTRY(12, 1, PTX_81),
CUDA_ENTRY(12, 2, PTX_82),
CUDA_ENTRY(12, 3, PTX_83),
CUDA_ENTRY(12, 4, PTX_84),
CUDA_ENTRY(12, 5, PTX_85),
{"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max()),
PTXVersion::PTX_LAST},
// End of list tombstone
{"unknown", CudaVersion::UNKNOWN, {}, PTXVersion::PTX_42}};
#undef CUDA_ENTRY

const char *CudaVersionToString(CudaVersion V) {
Expand All @@ -71,6 +72,20 @@ CudaVersion ToCudaVersion(llvm::VersionTuple Version) {
return CudaVersion::UNKNOWN;
}

// Builds the "+ptx<N>" feature string for a PTX version, where N is the
// enumerator's numeric value. Versions outside (PTX_UNKNOWN, PTX_LAST] have no
// feature string and produce an empty result.
const std::string PTXVersionToFeature(PTXVersion V) {
  const bool HasFeature =
      V > PTXVersion::PTX_UNKNOWN && V <= PTXVersion::PTX_LAST;
  if (!HasFeature)
    return {};
  return llvm::formatv("+ptx{0}", static_cast<unsigned>(V));
}

// Looks up the PTX version paired with CUDA version V in CudaNameVersionMap.
// The first entry whose Version equals V wins; PTX_UNKNOWN is returned when
// the table has no such entry.
PTXVersion GetRequiredPTXVersion(CudaVersion V) {
  for (const CudaVersionMapEntry &Entry : CudaNameVersionMap) {
    if (Entry.Version != V)
      continue;
    return Entry.PTX;
  }
  return PTXVersion::PTX_UNKNOWN;
}

namespace {
struct OffloadArchToStringMap {
OffloadArch arch;
Expand All @@ -96,6 +111,7 @@ static const OffloadArchToStringMap arch_names[] = {
SM(89), // Ada Lovelace
SM(90), // Hopper
SM(90a), // Hopper
SM(custom), // Placeholder for a new arch.
GFX(600), // gfx600
GFX(601), // gfx601
GFX(602), // gfx602
Expand Down Expand Up @@ -181,6 +197,18 @@ OffloadArch StringToOffloadArch(llvm::StringRef S) {
return result->arch;
}

// Converts a custom SM name of the form `sm_<ID>[a]` into the numeric value
// used for __CUDA_ARCH__, i.e. `ID * 10`.
// Returns 0 on error: when the `sm_` prefix is missing, when `ID` is not a
// base-10 unsigned integer (this includes any suffix other than a single
// trailing `a`), or when `ID * 10` does not fit in `unsigned`.
unsigned CUDACustomSMToArchID(llvm::StringRef S) {
  if (!S.consume_front("sm_"))
    return 0;
  // A single trailing `a` is permitted and is not part of the numeric ID.
  S.consume_back("a");
  unsigned ID;
  if (S.getAsInteger(10, ID))
    return 0; // We've failed to parse the SM name
  // Unsigned multiplication wraps silently; treat an unrepresentable result as
  // an error instead of returning a bogus architecture value.
  if (ID > std::numeric_limits<unsigned>::max() / 10)
    return 0;
  return ID * 10;
}

CudaVersion MinVersionForOffloadArch(OffloadArch A) {
if (A == OffloadArch::UNKNOWN)
return CudaVersion::UNKNOWN;
Expand Down Expand Up @@ -221,6 +249,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
return CudaVersion::CUDA_118;
case OffloadArch::SM_90a:
return CudaVersion::CUDA_120;
case clang::OffloadArch::SM_custom:
return CudaVersion::UNKNOWN;
default:
llvm_unreachable("invalid enum");
}
Expand Down
6 changes: 5 additions & 1 deletion clang/lib/Basic/Targets/NVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
#include "NVPTX.h"
#include "Targets.h"
#include "clang/Basic/Builtins.h"
#include "clang/Basic/Cuda.h"
#include "clang/Basic/MacroBuilder.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"

using namespace clang;
Expand Down Expand Up @@ -180,7 +182,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,

if (Opts.CUDAIsDevice || Opts.OpenMPIsTargetDevice || !HostTarget) {
// Set __CUDA_ARCH__ for the GPU specified.
std::string CUDAArchCode = [this] {
std::string CUDAArchCode = [&]() -> std::string {
switch (GPU) {
case OffloadArch::GFX600:
case OffloadArch::GFX601:
Expand Down Expand Up @@ -281,6 +283,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
case OffloadArch::SM_90:
case OffloadArch::SM_90a:
return "900";
case OffloadArch::SM_custom:
return llvm::itostr(CUDACustomSMToArchID(Opts.CUDACustomSM));
}
llvm_unreachable("unhandled OffloadArch");
}();
Expand Down
1 change: 1 addition & 0 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2277,6 +2277,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
case OffloadArch::SM_89:
case OffloadArch::SM_90:
case OffloadArch::SM_90a:
case OffloadArch::SM_custom:
case OffloadArch::GFX600:
case OffloadArch::GFX601:
case OffloadArch::GFX602:
Expand Down
9 changes: 5 additions & 4 deletions clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1670,7 +1670,8 @@ void Clang::AddARMTargetArgs(const llvm::Triple &Triple, const ArgList &Args,
AddUnalignedAccessWarning(CmdArgs);
}

void Clang::RenderTargetOptions(const llvm::Triple &EffectiveTriple,
void Clang::RenderTargetOptions(const JobAction &JA,
const llvm::Triple &EffectiveTriple,
const ArgList &Args, bool KernelOrKext,
ArgStringList &CmdArgs) const {
const ToolChain &TC = getToolChain();
Expand Down Expand Up @@ -5378,7 +5379,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-disable-llvm-passes");

// Render target options.
TC.addClangTargetOptions(Args, CmdArgs, JA.getOffloadingDeviceKind());
TC.addClangTargetOptions(Args, CmdArgs, JA);

// reject options that shouldn't be supported in bitcode
// also reject kernel/kext
Expand Down Expand Up @@ -6069,7 +6070,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
/*ForAS*/ false, /*IsAux*/ true);
}

TC.addClangTargetOptions(Args, CmdArgs, JA.getOffloadingDeviceKind());
TC.addClangTargetOptions(Args, CmdArgs, JA);

addMCModel(D, Args, Triple, RelocationModel, CmdArgs);

Expand All @@ -6096,7 +6097,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back(Args.MakeArgString(CPU));
}

RenderTargetOptions(Triple, Args, KernelOrKext, CmdArgs);
RenderTargetOptions(JA, Triple, Args, KernelOrKext, CmdArgs);

// Add clang-cl arguments.
types::ID InputType = Input.getType();
Expand Down
9 changes: 6 additions & 3 deletions clang/lib/Driver/ToolChains/Clang.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool {
const InputInfo &Output,
const InputInfoList &Inputs) const;

void RenderTargetOptions(const llvm::Triple &EffectiveTriple,
void RenderTargetOptions(const JobAction &JA,
const llvm::Triple &EffectiveTriple,
const llvm::opt::ArgList &Args, bool KernelOrKext,
llvm::opt::ArgStringList &CmdArgs) const;

Expand All @@ -61,6 +62,8 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool {
llvm::opt::ArgStringList &CmdArgs) const;
void AddMIPSTargetArgs(const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) const;
void AddNVPTXTargetArgs(const JobAction &JA, const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) const;
void AddPPCTargetArgs(const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) const;
void AddR600TargetArgs(const llvm::opt::ArgList &Args,
Expand Down Expand Up @@ -94,8 +97,8 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool {

mutable std::unique_ptr<llvm::raw_fd_ostream> CompilationDatabase = nullptr;
void DumpCompilationDatabase(Compilation &C, StringRef Filename,
StringRef Target,
const InputInfo &Output, const InputInfo &Input,
StringRef Target, const InputInfo &Output,
const InputInfo &Input,
const llvm::opt::ArgList &Args) const;

void DumpCompilationDatabaseFragmentToDir(
Expand Down
Loading
Loading