diff --git a/WORKSPACE b/WORKSPACE index 9dcd29c45d6..412b1a7568f 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -39,13 +39,11 @@ http_archive( patch_tool = "patch", patches = [ "//openxla_patches:cache_urls.diff", - "//openxla_patches:cuda_graph.diff", "//openxla_patches:f16_abi_clang.diff", "//openxla_patches:gpu_race_condition.diff", "//openxla_patches:profiler_trace.diff", "//openxla_patches:stream_executor.diff", "//openxla_patches:topk_rewriter.diff", - "//openxla_patches:triton_filesystem.diff", "//openxla_patches:xla_bzl.diff", ], strip_prefix = "xla-ce3949f58d8a3a791c36741b248bea52954b1648", diff --git a/openxla_patches/cuda_graph.diff b/openxla_patches/cuda_graph.diff deleted file mode 100644 index 18b9acf5ca3..00000000000 --- a/openxla_patches/cuda_graph.diff +++ /dev/null @@ -1,194 +0,0 @@ -Revert 8aa20d47ea5c863b9edeb072489ffb7c38b6e8eb -diff --git a/xla/service/gpu/runtime/BUILD b/xla/service/gpu/runtime/BUILD -index 328a19690..e70b39c16 100644 ---- a/xla/service/gpu/runtime/BUILD -+++ b/xla/service/gpu/runtime/BUILD -@@ -375,9 +375,7 @@ cc_library( - "//xla/stream_executor", - "@com_google_absl//absl/container:node_hash_map", - "@com_google_absl//absl/synchronization", -- ] + if_cuda_is_configured([ -- "//xla/stream_executor/cuda:cuda_graph", -- ]), -+ ], - ) - - cc_library( -diff --git a/xla/service/gpu/runtime/executable.cc b/xla/service/gpu/runtime/executable.cc -index a0919e1cf..75966a044 100644 ---- a/xla/service/gpu/runtime/executable.cc -+++ b/xla/service/gpu/runtime/executable.cc -@@ -371,6 +371,7 @@ Status GpuRuntimeExecutable::Execute( - graph_instances_(executor)->snapshot(); - CapturedFunctionExecutionCount::Snapshot execution_count = - captured_function_counts_(executor)->snapshot(); -+ CapturingCudaGraph capturing_cuda_graph(false); - #endif // GOOGLE_CUDA - - // Kernels in concurrent regions should be launched on borrowed stream, so -@@ -398,7 +399,7 @@ Status GpuRuntimeExecutable::Execute( - &collectives_, &fft_plans, &send_recv_events, &gpu_lock, - #if GOOGLE_CUDA - // Auxiliary data that is available only if compiled with CUDA support. -- &matmul_plans, &graph_instances, &execution_count, -+ &matmul_plans, &graph_instances, &execution_count, &capturing_cuda_graph, - #endif // GOOGLE_CUDA - &concurrent_region_status, - // Null pointer will be interpreted as an absence of async collectives -diff --git a/xla/service/gpu/runtime/graph_launch.cc b/xla/service/gpu/runtime/graph_launch.cc -index b54b9447f..1d60085e8 100644 ---- a/xla/service/gpu/runtime/graph_launch.cc -+++ b/xla/service/gpu/runtime/graph_launch.cc -@@ -279,11 +279,18 @@ static absl::Status LaunchGraph( - // Compute the hash of the buffer arguments. - size_t ptrs_hash = absl::HashOf(RemainingArgsPtrs{fwd_args, temp_buffer}); - -+ CapturingCudaGraph not_capturing(false); -+ CapturingCudaGraph capturing(true); - // Forwards user data required for launching kernels. -- auto user_data = [&] { -+ auto user_data_no_capture = [&] { - return CustomCall::UserData(run_options, debug_options, ptx, cubin, - temp_buffer, kernels, convs, executable, -- gemm_config, gpu_lock); -+ gemm_config, gpu_lock, ¬_capturing); -+ }; -+ auto user_data_capture = [&] { -+ return CustomCall::UserData(run_options, debug_options, ptx, cubin, -+ temp_buffer, kernels, convs, executable, -+ gemm_config, gpu_lock, &capturing); - }; - - absl::StatusOr>*> get_count = -@@ -298,15 +305,16 @@ static absl::Status LaunchGraph( - debug_options->xla_gpu_cuda_graph_instantiation_threshold(); - if (count < instantiation_threshold) { - // Run captured graph directly. -- absl::Status result = RunGraphWithoutCapture(run_options, function_ref, -- fwd_args, user_data()); -+ absl::Status result = RunGraphWithoutCapture( -+ run_options, function_ref, fwd_args, user_data_no_capture()); - if (!result.ok()) return result; - return absl::OkStatus(); - } - - absl::StatusOr instance = instances->GetOrCreate( - capture.ordinal, [&]() -> absl::StatusOr { -- auto g = CaptureGraph(run_options, function_ref, fwd_args, user_data()); -+ auto g = CaptureGraph(run_options, function_ref, fwd_args, -+ user_data_capture()); - if (!g.ok()) return g.status(); - - auto e = se::gpu::InstantiateCudaGraph(std::move(*g)); -@@ -330,7 +338,8 @@ static absl::Status LaunchGraph( - VLOG(3) << "Update cached graph instance"; - - // Capture CUDA graph by running capture function. -- auto g = CaptureGraph(run_options, function_ref, fwd_args, user_data()); -+ auto g = -+ CaptureGraph(run_options, function_ref, fwd_args, user_data_capture()); - if (!g.ok()) return g.status(); - - // Update captured graph executable. -diff --git a/xla/service/gpu/runtime/kernel_launch.cc b/xla/service/gpu/runtime/kernel_launch.cc -index a498df8f5..3bc18912a 100644 ---- a/xla/service/gpu/runtime/kernel_launch.cc -+++ b/xla/service/gpu/runtime/kernel_launch.cc -@@ -30,10 +30,6 @@ limitations under the License. - #include "xla/service/service_executable_run_options.h" - #include "xla/stream_executor/kernel.h" - --#if GOOGLE_CUDA --#include "xla/stream_executor/cuda/cuda_graph.h" --#endif // #if GOOGLE_CUDA -- - namespace xla { - namespace gpu { - -@@ -54,6 +50,9 @@ StreamExecutorKernels* GpuExecutableKernels::operator()( - static absl::Status LaunchImpl( - const ServiceExecutableRunOptions* run_options, const std::string* ptx, - const std::vector* cubin, se::DeviceMemoryBase* temp_buffer, -+#if GOOGLE_CUDA -+ CapturingCudaGraph* capturing_cuda_graph, -+#endif - State> device_kernel, - int32_t shared_memory_bytes, int32_t grid_size_x, int32_t grid_size_y, - int32_t grid_size_z, int32_t block_size_x, int32_t block_size_y, -@@ -79,9 +78,7 @@ static absl::Status LaunchImpl( - assert((**kernel)->name() == name && "unexpected loaded kernel"); - - #if GOOGLE_CUDA -- absl::StatusOr is_capturing = se::gpu::IsStreamCapturing(stream); -- if (!is_capturing.ok()) return is_capturing.status(); -- if (is_capturing.value()) { -+ if (capturing_cuda_graph->capturing()) { - VLOG(3) << "Launching " << (**kernel)->name() - << "during CUDA graph capture"; - } else { -@@ -127,6 +124,9 @@ XLA_RUNTIME_DEFINE_CUSTOM_CALL( - .UserData() - .UserData*>() - .UserData() -+#if GOOGLE_CUDA -+ .UserData() -+#endif - .State>("uid") - .Arg() // shared_memory_bytes - .Arg() // grid_size_x -diff --git a/xla/service/gpu/runtime/support.h b/xla/service/gpu/runtime/support.h -index 65148218f..423835913 100644 ---- a/xla/service/gpu/runtime/support.h -+++ b/xla/service/gpu/runtime/support.h -@@ -116,6 +116,15 @@ inline void PopulateDotDimsAttrEncoding( - .Add("rhs_contract", &DotDimsAttr::getRhsContractingDimensions)); - } - -+class CapturingCudaGraph { -+ public: -+ explicit CapturingCudaGraph(bool capturing) : capturing_(capturing) {} -+ bool capturing() { return capturing_; } -+ -+ private: -+ bool capturing_ = false; -+}; -+ - } // namespace gpu - } // namespace xla - -diff --git a/xla/stream_executor/cuda/cuda_graph.cc b/xla/stream_executor/cuda/cuda_graph.cc -index 2e6a1cdff..ed055a6b5 100644 ---- a/xla/stream_executor/cuda/cuda_graph.cc -+++ b/xla/stream_executor/cuda/cuda_graph.cc -@@ -175,17 +175,5 @@ tsl::StatusOr InstantiateCudaGraph(OwnedCudaGraph graph) { - return OwnedCudaGraphExec(exec); - } - --tsl::StatusOr IsStreamCapturing(stream_executor::Stream* stream) { -- cudaStreamCaptureStatus capture_status; -- cudaError_t err = cudaStreamIsCapturing( -- stream_executor::gpu::AsGpuStreamValue(stream), &capture_status); -- if (err != cudaSuccess) { -- return InternalError("Failed to get stream's capture status: %s", -- cudaGetErrorString(err)); -- } -- -- return capture_status == cudaStreamCaptureStatusActive; --} -- - } // namespace gpu - } // namespace stream_executor -diff --git a/xla/stream_executor/cuda/cuda_graph.h b/xla/stream_executor/cuda/cuda_graph.h -index cfe51436f..581f895cf 100644 ---- a/xla/stream_executor/cuda/cuda_graph.h -+++ b/xla/stream_executor/cuda/cuda_graph.h -@@ -85,9 +85,6 @@ tsl::StatusOr CaptureCudaGraph( - // Instantiates a captured cuda graph instance into a cuda graph executable. - tsl::StatusOr InstantiateCudaGraph(OwnedCudaGraph graph); - --// Returns true if the stream is in graph capture mode --tsl::StatusOr IsStreamCapturing(stream_executor ::Stream* stream); -- - } // namespace gpu - } // namespace stream_executor - \ No newline at end of file diff --git a/openxla_patches/triton_filesystem.diff b/openxla_patches/triton_filesystem.diff deleted file mode 100644 index cab6fbb2748..00000000000 --- a/openxla_patches/triton_filesystem.diff +++ /dev/null @@ -1,89 +0,0 @@ -This diff tries to fix a compilation error that std::filesystem cannot be found. -diff --git a/third_party/triton/cl526173620.patch b/third_party/triton/cl526173620.patch -index b1addd9b5..4113fc074 100644 ---- a/third_party/triton/cl526173620.patch -+++ b/third_party/triton/cl526173620.patch -@@ -79,3 +79,82 @@ - rewriter.replaceOpWithNewOp(op, newRet); - return mlir::success(); - } -+ -+diff --git a/BUILD b/BUILD -+index b6b2357..18be1fc 100644 -+--- a/BUILD -++++ b/BUILD -+@@ -277,6 +277,7 @@ cc_library( -+ srcs = glob(["lib/Dialect/Triton/IR/*.cpp"]), -+ hdrs = glob(["include/triton/Dialect/Triton/IR/*.h"]), -+ copts = ["-Wno-unused-variable"], # TODO(manany): fix -++ linkopts = ["-lstdc++fs"], -+ includes = ["include"], -+ deps = [ -+ ":triton_dialect_inc_gen", -+@@ -329,6 +330,7 @@ cc_library( -+ "include/triton/Dialect/TritonGPU/IR/*.h", -+ ]), -+ copts = ["-Wno-unused-variable"], # TODO(csigg): fix -++ linkopts = ["-lstdc++fs"], -+ includes = ["include"], -+ deps = [ -+ ":TritonDialect", -+@@ -357,6 +359,7 @@ cc_library( -+ ]), -+ hdrs = glob(["include/triton/Dialect/TritonGPU/Transforms/*.h"]), -+ copts = ["-Wno-unused-variable"], # TODO(csigg): fix -++ linkopts = ["-lstdc++fs"], -+ includes = ["include"], -+ deps = [ -+ ":TritonDialect", -+@@ -392,6 +395,7 @@ cc_library( -+ "include/triton/Conversion/TritonGPUToLLVM/*.h", -+ ]), -+ copts = ["-Wno-unused-variable"], # TODO(csigg): fix -++ linkopts = ["-lstdc++fs"], -+ includes = [ -+ "include", -+ "lib/Conversion/TritonGPUToLLVM", -+diff --git a/lib/Target/LLVMIR/LLVMIRTranslation.cpp b/lib/Target/LLVMIR/LLVMIRTranslation.cpp -+index a7ea05a..92bddc5 100644 -+--- a/lib/Target/LLVMIR/LLVMIRTranslation.cpp -++++ b/lib/Target/LLVMIR/LLVMIRTranslation.cpp -+@@ -26,7 +26,7 @@ -+ #include "llvm/Linker/Linker.h" -+ #include "llvm/Support/SourceMgr.h" -+ #include -+-#include -++#include -+ #include -+ -+ namespace mlir { -+@@ -152,16 +152,16 @@ static std::map getExternLibs(mlir::ModuleOp module) { -+ externLibs.try_emplace(libdevice, env_path); -+ return externLibs; -+ } -+- namespace fs = std::filesystem; -++ namespace fs = std::experimental::filesystem; -+ // Search for libdevice relative to its library path if used from Python -+ // Then native code is in `triton/_C/libtriton.so` and libdevice in -+ // `triton/third_party/cuda/lib/libdevice.10.bc` -+ static const auto this_library_path = [] { -+ Dl_info fileinfo; -+ if (dladdr(reinterpret_cast(&getExternLibs), &fileinfo) == 0) { -+- return std::filesystem::path(); -++ return std::experimental::filesystem::path(); -+ } -+- return std::filesystem::path(fileinfo.dli_fname); -++ return std::experimental::filesystem::path(fileinfo.dli_fname); -+ }(); -+ static const auto runtime_path = -+ this_library_path.parent_path().parent_path() / "third_party" / "cuda" / -+@@ -174,7 +174,7 @@ static std::map getExternLibs(mlir::ModuleOp module) { -+ // using its default path: -+ // [triton root dir]/python/triton/language/libdevice.10.bc -+ // TODO(Keren): handle external linkage other than libdevice? -+- static const auto this_file_path = std::filesystem::path(__FILE__); -++ static const auto this_file_path = std::experimental::filesystem::path(__FILE__); -+ static const auto compiletime_path = this_file_path.parent_path() -+ .parent_path() -+ .parent_path() -\ No newline at end of file \ No newline at end of file