diff --git a/BUILD b/BUILD index 644177fe8abb..65e12b002f49 100644 --- a/BUILD +++ b/BUILD @@ -53,6 +53,11 @@ _no_unused_variable = select({ "//conditions:default": ["-Wno-unused-variable"], }) +_no_unused_variable_no_parentheses = select({ + ":compiler_is_msvc": [], + "//conditions:default": ["-Wno-unused-variable -Wno-parentheses"], +}) + td_library( name = "td_files", srcs = glob(["include/triton/**/*.td"]), @@ -350,6 +355,7 @@ cc_library( name = "TritonTransforms", srcs = glob(["lib/Dialect/Triton/Transforms/*.cpp"]), hdrs = glob(["include/triton/Dialect/Triton/Transforms/*.h"]), + copts = ["-Wno-parentheses"], includes = ["include"], deps = [ ":TritonDialects", @@ -413,7 +419,7 @@ cc_library( "include/triton/Tools/Sys/*.hpp", "include/triton/Conversion/TritonGPUToLLVM/*.h", ]), - copts = _no_unused_variable, + copts = _no_unused_variable_no_parentheses, includes = [ "include", "lib/Conversion/TritonGPUToLLVM", diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index 1921d25b2132..bc9ab42d42d0 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -125,7 +125,7 @@ unsigned ReduceOpHelper::getScratchSizeInBytes() { unsigned bytesPerElem = 0; for (const auto &ty : srcElementTypes) { - bytesPerElem += ty.getIntOrFloatBitWidth() / 8; + bytesPerElem += (ty.getIntOrFloatBitWidth() + 7) / 8; } return bytesPerElem * elems; } diff --git a/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp b/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp index 8129b0092fc8..e6a69042f8b9 100644 --- a/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp +++ b/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp @@ -91,7 +91,7 @@ class BlockedToMMA : public mlir::RewritePattern { int finalBitWidth = getElementTypeOrSelf(x).getIntOrFloatBitWidth(); int origBitWidth = finalBitWidth; SetVector slice; - mlir::getBackwardSlice(x, &slice, bwdFilter); + mlir::getBackwardSlice(x, &slice, {{bwdFilter}}); Operation *firstOp = slice.empty() ? nullptr : *slice.begin(); if (firstOp) if (Value arg = firstOp->getOperand(0)) diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp index 95130a3f13fd..ce4455f04655 100644 --- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp @@ -93,6 +93,11 @@ bool isExpensiveLoadOrStore(Operation *op, Attribute &targetEncoding) { // same if (isSingleValue(op->getOperand(0))) return false; + // TODO(manany): Investigate with Openai why the change here + // https://github.com/openai/triton/commit/640f3c392184cd14291c1bca6a4795eb0f32a61a + // which introduces Case 2 causes breakage to this test + // //third_party/py/jax_triton/tests:pallas_test_sm80 --test_filter=test_fused_attention_bwd + return true; // Case 2: Tensor of pointers has more threads than elements // we can presume a high hit-rate that makes it cheap to load auto ptrType = op->getOperand(0).getType().cast(); diff --git a/lib/Target/LLVMIR/LLVMIRTranslation.cpp b/lib/Target/LLVMIR/LLVMIRTranslation.cpp index f2e90e9de874..409aae0b0e80 100644 --- a/lib/Target/LLVMIR/LLVMIRTranslation.cpp +++ b/lib/Target/LLVMIR/LLVMIRTranslation.cpp @@ -27,6 +27,7 @@ #include "llvm/IRReader/IRReader.h" #include "llvm/Linker/Linker.h" #include "llvm/Support/SourceMgr.h" +#include "third_party/py/triton/google/find_cuda.h" #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN #include @@ -187,10 +188,8 @@ static std::map getExternLibs(mlir::ModuleOp module) { // Search for libdevice relative to its library path if used from Python // Then native code is in `triton/_C/libtriton.so` and libdevice in // `triton/third_party/cuda/lib/libdevice.10.bc` - static const auto this_library_path = getThisLibraryPath(); static const auto runtime_path = - this_library_path.parent_path().parent_path() / "third_party" / "cuda" / - "lib" / "libdevice.10.bc"; + fs::path(PathToLibdevice()) / "libdevice.10.bc"; if (fs::exists(runtime_path)) { externLibs.try_emplace(libdevice, runtime_path.string()); } else { diff --git a/lib/Target/PTX/PTXTranslation.cpp b/lib/Target/PTX/PTXTranslation.cpp index f089f1b16874..d95b3ccbc349 100644 --- a/lib/Target/PTX/PTXTranslation.cpp +++ b/lib/Target/PTX/PTXTranslation.cpp @@ -49,7 +49,7 @@ std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version) { auto *shortPtr = static_cast *>(options["nvptx-short-ptr"]); assert(shortPtr); - shortPtr->setValue(true); + shortPtr->setValue(false); std::string sm = cc == 90 ? "sm_90a" : "sm_" + std::to_string(cc); // max PTX version int ptxMajor = maxPTX / 10; diff --git a/test/TritonGPU/combine.mlir b/test/TritonGPU/combine.mlir index e123841e42cc..1af66e6ff12e 100644 --- a/test/TritonGPU/combine.mlir +++ b/test/TritonGPU/combine.mlir @@ -69,18 +69,6 @@ tt.func @remat_single_value(%arg: !tt.ptr {tt.divisibility = 16 : i32}) { tt.return } -tt.func @remat_fast_load(%arg: !tt.ptr {tt.divisibility = 16 : i32}) { - %0 = tt.splat %arg : (!tt.ptr) -> tensor<16x!tt.ptr, #layout1> - %1 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #layout1> - %2 = tt.addptr %0, %1 : tensor<16x!tt.ptr, #layout1>, tensor<16xi32, #layout1> - %3 = tt.load %2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16xi32, #layout1> - // CHECK-NOT: triton_gpu.convert_layout - %4 = triton_gpu.convert_layout %3 : (tensor<16xi32, #layout1>) -> tensor<16xi32, #layout0> - %5 = triton_gpu.convert_layout %2 : (tensor<16x!tt.ptr, #layout1>) -> tensor<16x!tt.ptr, #layout0> - tt.store %5, %4 : tensor<16xi32, #layout0> - tt.return -} - // CHECK-LABEL: if tt.func @if(%arg0: i32, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) { // CHECK-NOT: triton_gpu.convert_layout diff --git a/third_party/intel_xpu_backend b/third_party/intel_xpu_backend deleted file mode 160000 index 0bcc485f82b3..000000000000 --- a/third_party/intel_xpu_backend +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0bcc485f82b34d49494bd0264bacc24a20aafb7a