diff --git a/xla/service/gpu/buffer_comparator.cc b/xla/service/gpu/buffer_comparator.cc index 873a71cc09d71..5d3cbc21f0675 100644 --- a/xla/service/gpu/buffer_comparator.cc +++ b/xla/service/gpu/buffer_comparator.cc @@ -522,7 +522,8 @@ static const char* buffer_compare_ptx = R"( $L__BB1_9: ret; -} +})" + R"( // .globl __xla_fp16_comparison .visible .entry __xla_fp16_comparison( .param .u64 __xla_fp16_comparison_param_0, diff --git a/xla/service/gpu/hlo_op_profiles.h b/xla/service/gpu/hlo_op_profiles.h index c993f9e8f654a..85bf9faa344f0 100644 --- a/xla/service/gpu/hlo_op_profiles.h +++ b/xla/service/gpu/hlo_op_profiles.h @@ -589,142 +589,141 @@ constexpr char kDeviceHloOpProfiles[] = R"pb( } } - entries { - key: "sm_80" - value { - entries { - instruction { - opcode: "divide" - shape { element_type: S8 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S16 } - } - clock_cycles: 468 - } - entries { - instruction { - opcode: "divide" - shape { element_type: S64 } - } - clock_cycles: 1094 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U8 } - } - clock_cycles: 420 - } - entries { - instruction { - opcode: "power" - shape { element_type: U8 } - } - clock_cycles: 417 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U16 } - } - clock_cycles: 391 - } - entries { - instruction { - opcode: "power" - shape { element_type: U16 } - } - clock_cycles: 454 - } - entries { - instruction { - opcode: "divide" - shape { element_type: U64 } - } - clock_cycles: 908 - } - entries { - instruction { - opcode: "power" - shape { element_type: U64 } - } - clock_cycles: 744 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F16 } - } - clock_cycles: 1195 - } - entries { - instruction { - opcode: "log" - shape { element_type: F16 } - } - clock_cycles: 321 - } - entries { - instruction { - opcode: "log-plus-one" - shape { element_type: F16 } - } - clock_cycles: 346 - } - entries { - instruction { - opcode: "sqrt" - shape { element_type: F16 } - } - clock_cycles: 124 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F16 } - } - clock_cycles: 499 - } - entries { - instruction { - opcode: "log" - shape { element_type: F32 } - } - clock_cycles: 259 - } - entries { - instruction { - opcode: "tanh" - shape { element_type: F32 } - } - clock_cycles: 504 - } - entries { - instruction { - opcode: "power" - shape { element_type: F32 } - } - clock_cycles: 1221 - } - entries { - instruction { - opcode: "cbrt" - shape { element_type: F64 } - } - clock_cycles: 1638 - } - entries { - instruction { - opcode: "exponential-minus-one" - shape { element_type: F64 } - } - clock_cycles: 572 - } + entries { key: "sm_80" + value { entries { + instruction { + opcode: "divide" + shape { element_type: S8 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S16 } + } + clock_cycles: 468 + } + entries { + instruction { + opcode: "divide" + shape { element_type: S64 } + } + clock_cycles: 1094 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U8 } + } + clock_cycles: 420 + } + entries { + instruction { + opcode: "power" + shape { element_type: U8 } + } + clock_cycles: 417 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U16 } + } + clock_cycles: 391 + } + entries { + instruction { + opcode: "power" + shape { element_type: U16 } + } + clock_cycles: 454 + } + entries { + instruction { + opcode: "divide" + shape { element_type: U64 } + } + clock_cycles: 908 + } + entries { + instruction { + opcode: "power" + shape { element_type: U64 } + } + clock_cycles: 744 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F16 } + } + clock_cycles: 1195 + } + entries { + instruction { + opcode: "log" + shape { element_type: F16 } + } + clock_cycles: 321 + } + entries { + instruction { + opcode: "log-plus-one" + shape { element_type: F16 } + } + clock_cycles: 346 + } + entries { + instruction { + opcode: "sqrt" + shape { element_type: F16 } + } + clock_cycles: 124 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F16 } + } + clock_cycles: 499 + } + entries { + instruction { + opcode: "log" + shape { element_type: F32 } + } + clock_cycles: 259 + } + entries { + instruction { + opcode: "tanh" + shape { element_type: F32 } + } + clock_cycles: 504 + } + entries { + instruction { + opcode: "power" + shape { element_type: F32 } + } + clock_cycles: 1221 + } + entries { + instruction { + opcode: "cbrt" + shape { element_type: F64 } + } + clock_cycles: 1638 + } + entries { + instruction { + opcode: "exponential-minus-one" + shape { element_type: F64 } + } + clock_cycles: 572 + })pb" + R"pb( entries { instruction { opcode: "log" @@ -887,7 +886,7 @@ constexpr char kDeviceHloOpProfiles[] = R"pb( clock_cycles: 6054 } } - } + } entries { key: "sm_70" @@ -1486,6 +1485,8 @@ constexpr char kDeviceHloOpProfiles[] = R"pb( } clock_cycles: 435 } + )pb" + R"pb( entries { instruction { opcode: "log-plus-one" @@ -1767,7 +1768,7 @@ constexpr char kDeviceHloOpProfiles[] = R"pb( clock_cycles: 10458 } } - } + } entries { key: "sm_75" @@ -2038,6 +2039,8 @@ constexpr char kDeviceHloOpProfiles[] = R"pb( } clock_cycles: 74 } + )pb" + R"pb( entries { instruction { opcode: "tanh" @@ -2361,7 +2364,7 @@ constexpr char kDeviceHloOpProfiles[] = R"pb( clock_cycles: 97 } } - } + } )pb"; } // namespace gpu diff --git a/xla/service/gpu/triton_autotuner.cc b/xla/service/gpu/triton_autotuner.cc index 8e6f9d32bee8f..578ff05550dd2 100644 --- a/xla/service/gpu/triton_autotuner.cc +++ b/xla/service/gpu/triton_autotuner.cc @@ -740,9 +740,9 @@ std::vector GetPossibleMatmulAutotuneConfigs( constexpr int kSufficientNumberOfTiles = 500; const int max_split_k = debug_options.xla_gpu_enable_split_k_autotuning() - ? std::max(1L, kSufficientNumberOfTiles * kMaxTileSize * - kMaxTileSize / - ShapeUtil::ElementsIn(instr.shape())) + ? std::max(1L, kSufficientNumberOfTiles * kMaxTileSize * + kMaxTileSize / + ShapeUtil::ElementsIn(instr.shape())) : 1; return exhaustive_tiling_search ? GetExhaustiveMatmulAutotuneConfigs(compute_capability,