From 87956e18259fb0fbd82c104b479a775687944b41 Mon Sep 17 00:00:00 2001
From: Christian Sigg
Date: Mon, 10 Jul 2023 11:16:08 +0200
Subject: [PATCH] OpenXLA-specific changes.

- Disable short pointer.
- Fixup expensiveLoadOrStore().
---
 BUILD                                        |  1 +
 lib/Dialect/TritonGPU/Transforms/Utility.cpp |  5 +++++
 lib/Target/LLVMIR/LLVMIRTranslation.cpp      |  5 ++---
 lib/Target/PTX/PTXTranslation.cpp            |  2 +-
 test/TritonGPU/combine.mlir                  | 12 ------------
 third_party/intel_xpu_backend                |  1 -
 6 files changed, 9 insertions(+), 17 deletions(-)
 delete mode 160000 third_party/intel_xpu_backend

diff --git a/BUILD b/BUILD
index 644177fe8abb..ab25e4653cdd 100644
--- a/BUILD
+++ b/BUILD
@@ -350,6 +350,7 @@ cc_library(
     name = "TritonTransforms",
     srcs = glob(["lib/Dialect/Triton/Transforms/*.cpp"]),
     hdrs = glob(["include/triton/Dialect/Triton/Transforms/*.h"]),
+    copts = ["-Wno-parentheses"],
     includes = ["include"],
     deps = [
         ":TritonDialects",
diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
index 95130a3f13fd..ce4455f04655 100644
--- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -93,6 +93,11 @@ bool isExpensiveLoadOrStore(Operation *op, Attribute &targetEncoding) {
   // same
   if (isSingleValue(op->getOperand(0)))
     return false;
+  // TODO(manany): Investigate with Openai why the change here
+  // https://github.com/openai/triton/commit/640f3c392184cd14291c1bca6a4795eb0f32a61a
+  // which introduces Case 2 causes breakage to this test
+  // //third_party/py/jax_triton/tests:pallas_test_sm80 --test_filter=test_fused_attention_bwd
+  return true;
   // Case 2: Tensor of pointers has more threads than elements
   // we can presume a high hit-rate that makes it cheap to load
   auto ptrType = op->getOperand(0).getType().cast<RankedTensorType>();
diff --git a/lib/Target/LLVMIR/LLVMIRTranslation.cpp b/lib/Target/LLVMIR/LLVMIRTranslation.cpp
index f2e90e9de874..409aae0b0e80 100644
--- a/lib/Target/LLVMIR/LLVMIRTranslation.cpp
+++ b/lib/Target/LLVMIR/LLVMIRTranslation.cpp
@@ -27,6 +27,7 @@
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Linker/Linker.h"
 #include "llvm/Support/SourceMgr.h"
+#include "third_party/py/triton/google/find_cuda.h"
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
@@ -187,10 +188,8 @@ static std::map<std::string, std::string> getExternLibs(mlir::ModuleOp module) {
   // Search for libdevice relative to its library path if used from Python
   // Then native code is in `triton/_C/libtriton.so` and libdevice in
   // `triton/third_party/cuda/lib/libdevice.10.bc`
-  static const auto this_library_path = getThisLibraryPath();
   static const auto runtime_path =
-      this_library_path.parent_path().parent_path() / "third_party" / "cuda" /
-      "lib" / "libdevice.10.bc";
+      fs::path(PathToLibdevice()) / "libdevice.10.bc";
   if (fs::exists(runtime_path)) {
     externLibs.try_emplace(libdevice, runtime_path.string());
   } else {
diff --git a/lib/Target/PTX/PTXTranslation.cpp b/lib/Target/PTX/PTXTranslation.cpp
index 6431b6ae8d89..bc29f18ec773 100644
--- a/lib/Target/PTX/PTXTranslation.cpp
+++ b/lib/Target/PTX/PTXTranslation.cpp
@@ -49,7 +49,7 @@ std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version) {
   auto *shortPtr =
       static_cast<llvm::cl::opt<bool> *>(options["nvptx-short-ptr"]);
   assert(shortPtr);
-  shortPtr->setValue(true);
+  shortPtr->setValue(false);
   std::string sm = cc == 90 ? "sm_90a" : "sm_" + std::to_string(cc);
   // max PTX version
   int ptxMajor = maxPTX / 10;
diff --git a/test/TritonGPU/combine.mlir b/test/TritonGPU/combine.mlir
index e123841e42cc..1af66e6ff12e 100644
--- a/test/TritonGPU/combine.mlir
+++ b/test/TritonGPU/combine.mlir
@@ -69,18 +69,6 @@ tt.func @remat_single_value(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
   tt.return
 }
 
-tt.func @remat_fast_load(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-  %0 = tt.splat %arg : (!tt.ptr<i32>) -> tensor<16x!tt.ptr<i32>, #layout1>
-  %1 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #layout1>
-  %2 = tt.addptr %0, %1 : tensor<16x!tt.ptr<i32>, #layout1>, tensor<16xi32, #layout1>
-  %3 = tt.load %2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16xi32, #layout1>
-  // CHECK-NOT: triton_gpu.convert_layout
-  %4 = triton_gpu.convert_layout %3 : (tensor<16xi32, #layout1>) -> tensor<16xi32, #layout0>
-  %5 = triton_gpu.convert_layout %2 : (tensor<16x!tt.ptr<i32>, #layout1>) -> tensor<16x!tt.ptr<i32>, #layout0>
-  tt.store %5, %4 : tensor<16xi32, #layout0>
-  tt.return
-}
-
 // CHECK-LABEL: if
 tt.func @if(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
   // CHECK-NOT: triton_gpu.convert_layout
diff --git a/third_party/intel_xpu_backend b/third_party/intel_xpu_backend
deleted file mode 160000
index 0bcc485f82b3..000000000000
--- a/third_party/intel_xpu_backend
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 0bcc485f82b34d49494bd0264bacc24a20aafb7a