From 87956e18259fb0fbd82c104b479a775687944b41 Mon Sep 17 00:00:00 2001
From: Christian Sigg
Date: Mon, 10 Jul 2023 11:16:08 +0200
Subject: [PATCH] OpenXLA-specific changes.

- Disable short pointer.
- Fixup expensiveLoadOrStore().
---
 BUILD                                        |  1 +
 lib/Dialect/TritonGPU/Transforms/Utility.cpp |  5 +++++
 lib/Target/LLVMIR/LLVMIRTranslation.cpp      |  5 ++---
 lib/Target/PTX/PTXTranslation.cpp            |  2 +-
 test/TritonGPU/combine.mlir                  | 12 ------------
 third_party/intel_xpu_backend                |  1 -
 6 files changed, 9 insertions(+), 17 deletions(-)
 delete mode 160000 third_party/intel_xpu_backend

diff --git a/BUILD b/BUILD
index 644177fe8abb..ab25e4653cdd 100644
--- a/BUILD
+++ b/BUILD
@@ -350,6 +350,7 @@ cc_library(
     name = "TritonTransforms",
     srcs = glob(["lib/Dialect/Triton/Transforms/*.cpp"]),
     hdrs = glob(["include/triton/Dialect/Triton/Transforms/*.h"]),
+    copts = ["-Wno-parentheses"],
     includes = ["include"],
     deps = [
         ":TritonDialects",
diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
index 95130a3f13fd..ce4455f04655 100644
--- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -93,6 +93,11 @@ bool isExpensiveLoadOrStore(Operation *op, Attribute &targetEncoding) {
   // same
   if (isSingleValue(op->getOperand(0)))
     return false;
+  // TODO(manany): Investigate with Openai why the change here
+  // https://github.com/openai/triton/commit/640f3c392184cd14291c1bca6a4795eb0f32a61a
+  // which introduces Case 2 causes breakage to this test
+  // //third_party/py/jax_triton/tests:pallas_test_sm80 --test_filter=test_fused_attention_bwd
+  return true;
   // Case 2: Tensor of pointers has more threads than elements
   // we can presume a high hit-rate that makes it cheap to load
   auto ptrType = op->getOperand(0).getType().cast<RankedTensorType>();
diff --git a/lib/Target/LLVMIR/LLVMIRTranslation.cpp b/lib/Target/LLVMIR/LLVMIRTranslation.cpp
index f2e90e9de874..409aae0b0e80 100644
--- a/lib/Target/LLVMIR/LLVMIRTranslation.cpp
+++ b/lib/Target/LLVMIR/LLVMIRTranslation.cpp
@@ -27,6 +27,7 @@
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Linker/Linker.h"
 #include "llvm/Support/SourceMgr.h"
+#include "third_party/py/triton/google/find_cuda.h"
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
@@ -187,10 +188,8 @@ static std::map<std::string, std::string> getExternLibs(mlir::ModuleOp module) {
   // Search for libdevice relative to its library path if used from Python
   // Then native code is in `triton/_C/libtriton.so` and libdevice in
   // `triton/third_party/cuda/lib/libdevice.10.bc`
-  static const auto this_library_path = getThisLibraryPath();
   static const auto runtime_path =
-      this_library_path.parent_path().parent_path() / "third_party" / "cuda" /
-      "lib" / "libdevice.10.bc";
+      fs::path(PathToLibdevice()) / "libdevice.10.bc";
   if (fs::exists(runtime_path)) {
     externLibs.try_emplace(libdevice, runtime_path.string());
   } else {
diff --git a/lib/Target/PTX/PTXTranslation.cpp b/lib/Target/PTX/PTXTranslation.cpp
index 6431b6ae8d89..bc29f18ec773 100644
--- a/lib/Target/PTX/PTXTranslation.cpp
+++ b/lib/Target/PTX/PTXTranslation.cpp
@@ -49,7 +49,7 @@ std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version) {
   auto *shortPtr =
       static_cast<llvm::cl::opt<bool> *>(options["nvptx-short-ptr"]);
   assert(shortPtr);
-  shortPtr->setValue(true);
+  shortPtr->setValue(false);
   std::string sm = cc == 90 ? "sm_90a" : "sm_" + std::to_string(cc);
   // max PTX version
   int ptxMajor = maxPTX / 10;
diff --git a/test/TritonGPU/combine.mlir b/test/TritonGPU/combine.mlir
index e123841e42cc..1af66e6ff12e 100644
--- a/test/TritonGPU/combine.mlir
+++ b/test/TritonGPU/combine.mlir
@@ -69,18 +69,6 @@ tt.func @remat_single_value(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
   tt.return
 }
 
-tt.func @remat_fast_load(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-  %0 = tt.splat %arg : (!tt.ptr<i32>) -> tensor<16x!tt.ptr<i32>, #layout1>
-  %1 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #layout1>
-  %2 = tt.addptr %0, %1 : tensor<16x!tt.ptr<i32>, #layout1>, tensor<16xi32, #layout1>
-  %3 = tt.load %2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16xi32, #layout1>
-  // CHECK-NOT: triton_gpu.convert_layout
-  %4 = triton_gpu.convert_layout %3 : (tensor<16xi32, #layout1>) -> tensor<16xi32, #layout0>
-  %5 = triton_gpu.convert_layout %2 : (tensor<16x!tt.ptr<i32>, #layout1>) -> tensor<16x!tt.ptr<i32>, #layout0>
-  tt.store %5, %4 : tensor<16xi32, #layout0>
-  tt.return
-}
-
 // CHECK-LABEL: if
 tt.func @if(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
   // CHECK-NOT: triton_gpu.convert_layout
diff --git a/third_party/intel_xpu_backend b/third_party/intel_xpu_backend
deleted file mode 160000
index 0bcc485f82b3..000000000000
--- a/third_party/intel_xpu_backend
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 0bcc485f82b34d49494bd0264bacc24a20aafb7a