diff --git a/BUILD b/BUILD
index 644177fe8abb..65e12b002f49 100644
--- a/BUILD
+++ b/BUILD
@@ -53,6 +53,11 @@ _no_unused_variable = select({
     "//conditions:default": ["-Wno-unused-variable"],
 })
 
+_no_unused_variable_no_parentheses = select({
+    ":compiler_is_msvc": [],  # No extra copts when :compiler_is_msvc matches.
+    "//conditions:default": ["-Wno-unused-variable", "-Wno-parentheses"],
+})
+
 td_library(
     name = "td_files",
     srcs = glob(["include/triton/**/*.td"]),
@@ -350,6 +355,7 @@ cc_library(
     name = "TritonTransforms",
     srcs = glob(["lib/Dialect/Triton/Transforms/*.cpp"]),
     hdrs = glob(["include/triton/Dialect/Triton/Transforms/*.h"]),
+    copts = ["-Wno-parentheses"],
     includes = ["include"],
     deps = [
         ":TritonDialects",
@@ -413,7 +419,7 @@ cc_library(
         "include/triton/Tools/Sys/*.hpp",
         "include/triton/Conversion/TritonGPUToLLVM/*.h",
     ]),
-    copts = _no_unused_variable,
+    copts = _no_unused_variable_no_parentheses,
     includes = [
         "include",
         "lib/Conversion/TritonGPUToLLVM",
diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp
index 1921d25b2132..bc9ab42d42d0 100644
--- a/lib/Analysis/Utility.cpp
+++ b/lib/Analysis/Utility.cpp
@@ -125,7 +125,7 @@ unsigned ReduceOpHelper::getScratchSizeInBytes() {
 
   unsigned bytesPerElem = 0;
   for (const auto &ty : srcElementTypes) {
-    bytesPerElem += ty.getIntOrFloatBitWidth() / 8;
+    bytesPerElem += (ty.getIntOrFloatBitWidth() + 7) / 8;
   }
   return bytesPerElem * elems;
 }
diff --git a/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp b/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp
index 8129b0092fc8..e6a69042f8b9 100644
--- a/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp
@@ -91,7 +91,7 @@ class BlockedToMMA : public mlir::RewritePattern {
     int finalBitWidth = getElementTypeOrSelf(x).getIntOrFloatBitWidth();
     int origBitWidth = finalBitWidth;
     SetVector<Operation *> slice;
-    mlir::getBackwardSlice(x, &slice, bwdFilter);
+    mlir::getBackwardSlice(x, &slice, {{bwdFilter}});
     Operation *firstOp = slice.empty() ? nullptr : *slice.begin();
     if (firstOp)
       if (Value arg = firstOp->getOperand(0))
diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
index 95130a3f13fd..ce4455f04655 100644
--- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -93,6 +93,11 @@ bool isExpensiveLoadOrStore(Operation *op, Attribute &targetEncoding) {
   // same
   if (isSingleValue(op->getOperand(0)))
     return false;
+  // TODO(manany): Investigate with OpenAI why Case 2 below, introduced in
+  // https://github.com/openai/triton/commit/640f3c392184cd14291c1bca6a4795eb0f32a61a,
+  // breaks the test below; until then, return early so Case 2 is never reached:
+  // //third_party/py/jax_triton/tests:pallas_test_sm80 --test_filter=test_fused_attention_bwd
+  return true;
   // Case 2: Tensor of pointers has more threads than elements
   // we can presume a high hit-rate that makes it cheap to load
   auto ptrType = op->getOperand(0).getType().cast<RankedTensorType>();
diff --git a/lib/Target/LLVMIR/LLVMIRTranslation.cpp b/lib/Target/LLVMIR/LLVMIRTranslation.cpp
index f2e90e9de874..409aae0b0e80 100644
--- a/lib/Target/LLVMIR/LLVMIRTranslation.cpp
+++ b/lib/Target/LLVMIR/LLVMIRTranslation.cpp
@@ -27,6 +27,7 @@
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Linker/Linker.h"
 #include "llvm/Support/SourceMgr.h"
+#include "third_party/py/triton/google/find_cuda.h"
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
@@ -187,10 +188,8 @@ static std::map<std::string, std::string> getExternLibs(mlir::ModuleOp module) {
     // Search for libdevice relative to its library path if used from Python
     // Then native code is in `triton/_C/libtriton.so` and libdevice in
     // `triton/third_party/cuda/lib/libdevice.10.bc`
-    static const auto this_library_path = getThisLibraryPath();
     static const auto runtime_path =
-        this_library_path.parent_path().parent_path() / "third_party" / "cuda" /
-        "lib" / "libdevice.10.bc";
+        fs::path(PathToLibdevice()) / "libdevice.10.bc";
     if (fs::exists(runtime_path)) {
       externLibs.try_emplace(libdevice, runtime_path.string());
     } else {
diff --git a/lib/Target/PTX/PTXTranslation.cpp b/lib/Target/PTX/PTXTranslation.cpp
index f089f1b16874..d95b3ccbc349 100644
--- a/lib/Target/PTX/PTXTranslation.cpp
+++ b/lib/Target/PTX/PTXTranslation.cpp
@@ -49,7 +49,7 @@ std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version) {
   auto *shortPtr =
       static_cast<llvm::cl::opt<bool> *>(options["nvptx-short-ptr"]);
   assert(shortPtr);
-  shortPtr->setValue(true);
+  shortPtr->setValue(false);
   std::string sm = cc == 90 ? "sm_90a" : "sm_" + std::to_string(cc);
   // max PTX version
   int ptxMajor = maxPTX / 10;
diff --git a/test/TritonGPU/combine.mlir b/test/TritonGPU/combine.mlir
index e123841e42cc..1af66e6ff12e 100644
--- a/test/TritonGPU/combine.mlir
+++ b/test/TritonGPU/combine.mlir
@@ -69,18 +69,6 @@ tt.func @remat_single_value(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
   tt.return
 }
 
-tt.func @remat_fast_load(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-  %0 = tt.splat %arg : (!tt.ptr<i32>) -> tensor<16x!tt.ptr<i32>, #layout1>
-  %1 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #layout1>
-  %2 = tt.addptr %0, %1 : tensor<16x!tt.ptr<i32>, #layout1>, tensor<16xi32, #layout1>
-  %3 = tt.load %2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16xi32, #layout1>
-  // CHECK-NOT: triton_gpu.convert_layout
-  %4 = triton_gpu.convert_layout %3 : (tensor<16xi32, #layout1>) -> tensor<16xi32, #layout0>
-  %5 = triton_gpu.convert_layout %2 : (tensor<16x!tt.ptr<i32>, #layout1>) -> tensor<16x!tt.ptr<i32>, #layout0>
-  tt.store %5, %4 : tensor<16xi32, #layout0>
-  tt.return
-}
-
 // CHECK-LABEL: if
 tt.func @if(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
   // CHECK-NOT: triton_gpu.convert_layout
diff --git a/third_party/intel_xpu_backend b/third_party/intel_xpu_backend
deleted file mode 160000
index 0bcc485f82b3..000000000000
--- a/third_party/intel_xpu_backend
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 0bcc485f82b34d49494bd0264bacc24a20aafb7a