diff --git a/WORKSPACE b/WORKSPACE
index 9dcd29c45d6..412b1a7568f 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -39,13 +39,11 @@ http_archive(
     patch_tool = "patch",
     patches = [
         "//openxla_patches:cache_urls.diff",
-        "//openxla_patches:cuda_graph.diff",
         "//openxla_patches:f16_abi_clang.diff",
         "//openxla_patches:gpu_race_condition.diff",
         "//openxla_patches:profiler_trace.diff",
         "//openxla_patches:stream_executor.diff",
         "//openxla_patches:topk_rewriter.diff",
-        "//openxla_patches:triton_filesystem.diff",
         "//openxla_patches:xla_bzl.diff",
     ],
     strip_prefix = "xla-ce3949f58d8a3a791c36741b248bea52954b1648",
diff --git a/openxla_patches/cuda_graph.diff b/openxla_patches/cuda_graph.diff
deleted file mode 100644
index 18b9acf5ca3..00000000000
--- a/openxla_patches/cuda_graph.diff
+++ /dev/null
@@ -1,194 +0,0 @@
-Revert 8aa20d47ea5c863b9edeb072489ffb7c38b6e8eb
-diff --git a/xla/service/gpu/runtime/BUILD b/xla/service/gpu/runtime/BUILD
-index 328a19690..e70b39c16 100644
---- a/xla/service/gpu/runtime/BUILD
-+++ b/xla/service/gpu/runtime/BUILD
-@@ -375,9 +375,7 @@ cc_library(
-         "//xla/stream_executor",
-         "@com_google_absl//absl/container:node_hash_map",
-         "@com_google_absl//absl/synchronization",
--    ] + if_cuda_is_configured([
--        "//xla/stream_executor/cuda:cuda_graph",
--    ]),
-+    ],
- )
- 
- cc_library(
-diff --git a/xla/service/gpu/runtime/executable.cc b/xla/service/gpu/runtime/executable.cc
-index a0919e1cf..75966a044 100644
---- a/xla/service/gpu/runtime/executable.cc
-+++ b/xla/service/gpu/runtime/executable.cc
-@@ -371,6 +371,7 @@ Status GpuRuntimeExecutable::Execute(
-       graph_instances_(executor)->snapshot();
-   CapturedFunctionExecutionCount::Snapshot execution_count =
-       captured_function_counts_(executor)->snapshot();
-+  CapturingCudaGraph capturing_cuda_graph(false);
- #endif  // GOOGLE_CUDA
- 
-   // Kernels in concurrent regions should be launched on borrowed stream, so
-@@ -398,7 +399,7 @@ Status GpuRuntimeExecutable::Execute(
-       &collectives_, &fft_plans, &send_recv_events, &gpu_lock,
- #if GOOGLE_CUDA
-       // Auxiliary data that is available only if compiled with CUDA support.
--      &matmul_plans, &graph_instances, &execution_count,
-+      &matmul_plans, &graph_instances, &execution_count, &capturing_cuda_graph,
- #endif  // GOOGLE_CUDA
-       &concurrent_region_status,
-       // Null pointer will be interpreted as an absence of async collectives
-diff --git a/xla/service/gpu/runtime/graph_launch.cc b/xla/service/gpu/runtime/graph_launch.cc
-index b54b9447f..1d60085e8 100644
---- a/xla/service/gpu/runtime/graph_launch.cc
-+++ b/xla/service/gpu/runtime/graph_launch.cc
-@@ -279,11 +279,18 @@ static absl::Status LaunchGraph(
-   // Compute the hash of the buffer arguments.
-   size_t ptrs_hash = absl::HashOf(RemainingArgsPtrs{fwd_args, temp_buffer});
- 
-+  CapturingCudaGraph not_capturing(false);
-+  CapturingCudaGraph capturing(true);
-   // Forwards user data required for launching kernels.
--  auto user_data = [&] {
-+  auto user_data_no_capture = [&] {
-     return CustomCall::UserData(run_options, debug_options, ptx, cubin,
-                                 temp_buffer, kernels, convs, executable,
--                                gemm_config, gpu_lock);
-+                                gemm_config, gpu_lock, &not_capturing);
-+  };
-+  auto user_data_capture = [&] {
-+    return CustomCall::UserData(run_options, debug_options, ptx, cubin,
-+                                temp_buffer, kernels, convs, executable,
-+                                gemm_config, gpu_lock, &capturing);
-   };
- 
-   absl::StatusOr<std::unique_ptr<std::atomic<uint64_t>>*> get_count =
-@@ -298,15 +305,16 @@ static absl::Status LaunchGraph(
-       debug_options->xla_gpu_cuda_graph_instantiation_threshold();
-   if (count < instantiation_threshold) {
-     // Run captured graph directly.
--    absl::Status result = RunGraphWithoutCapture(run_options, function_ref,
--                                                 fwd_args, user_data());
-+    absl::Status result = RunGraphWithoutCapture(
-+        run_options, function_ref, fwd_args, user_data_no_capture());
-     if (!result.ok()) return result;
-     return absl::OkStatus();
-   }
- 
-   absl::StatusOr<GraphInstance*> instance = instances->GetOrCreate(
-       capture.ordinal, [&]() -> absl::StatusOr<GraphInstance> {
--        auto g = CaptureGraph(run_options, function_ref, fwd_args, user_data());
-+        auto g = CaptureGraph(run_options, function_ref, fwd_args,
-+                              user_data_capture());
-         if (!g.ok()) return g.status();
- 
-         auto e = se::gpu::InstantiateCudaGraph(std::move(*g));
-@@ -330,7 +338,8 @@ static absl::Status LaunchGraph(
-   VLOG(3) << "Update cached graph instance";
- 
-   // Capture CUDA graph by running capture function.
--  auto g = CaptureGraph(run_options, function_ref, fwd_args, user_data());
-+  auto g =
-+      CaptureGraph(run_options, function_ref, fwd_args, user_data_capture());
-   if (!g.ok()) return g.status();
- 
-   // Update captured graph executable.
-diff --git a/xla/service/gpu/runtime/kernel_launch.cc b/xla/service/gpu/runtime/kernel_launch.cc
-index a498df8f5..3bc18912a 100644
---- a/xla/service/gpu/runtime/kernel_launch.cc
-+++ b/xla/service/gpu/runtime/kernel_launch.cc
-@@ -30,10 +30,6 @@ limitations under the License.
- #include "xla/service/service_executable_run_options.h"
- #include "xla/stream_executor/kernel.h"
- 
--#if GOOGLE_CUDA
--#include "xla/stream_executor/cuda/cuda_graph.h"
--#endif  // #if GOOGLE_CUDA
--
- namespace xla {
- namespace gpu {
- 
-@@ -54,6 +50,9 @@ StreamExecutorKernels* GpuExecutableKernels::operator()(
- static absl::Status LaunchImpl(
-     const ServiceExecutableRunOptions* run_options, const std::string* ptx,
-     const std::vector<uint8_t>* cubin, se::DeviceMemoryBase* temp_buffer,
-+#if GOOGLE_CUDA
-+    CapturingCudaGraph* capturing_cuda_graph,
-+#endif
-     State<std::unique_ptr<se::KernelBase>> device_kernel,
-     int32_t shared_memory_bytes, int32_t grid_size_x, int32_t grid_size_y,
-     int32_t grid_size_z, int32_t block_size_x, int32_t block_size_y,
-@@ -79,9 +78,7 @@ static absl::Status LaunchImpl(
-   assert((**kernel)->name() == name && "unexpected loaded kernel");
- 
- #if GOOGLE_CUDA
--  absl::StatusOr<bool> is_capturing = se::gpu::IsStreamCapturing(stream);
--  if (!is_capturing.ok()) return is_capturing.status();
--  if (is_capturing.value()) {
-+  if (capturing_cuda_graph->capturing()) {
-     VLOG(3) << "Launching " << (**kernel)->name()
-             << "during CUDA graph capture";
-   } else {
-@@ -127,6 +124,9 @@ XLA_RUNTIME_DEFINE_CUSTOM_CALL(
-         .UserData<const std::string*>()
-         .UserData<const std::vector<uint8_t>*>()
-         .UserData<se::DeviceMemoryBase*>()
-+#if GOOGLE_CUDA
-+        .UserData<CapturingCudaGraph*>()
-+#endif
-         .State<std::unique_ptr<se::KernelBase>>("uid")
-         .Arg<int32_t>()   // shared_memory_bytes
-         .Arg<int32_t>()   // grid_size_x
-diff --git a/xla/service/gpu/runtime/support.h b/xla/service/gpu/runtime/support.h
-index 65148218f..423835913 100644
---- a/xla/service/gpu/runtime/support.h
-+++ b/xla/service/gpu/runtime/support.h
-@@ -116,6 +116,15 @@ inline void PopulateDotDimsAttrEncoding(
-           .Add("rhs_contract", &DotDimsAttr::getRhsContractingDimensions));
- }
- 
-+class CapturingCudaGraph {
-+ public:
-+  explicit CapturingCudaGraph(bool capturing) : capturing_(capturing) {}
-+  bool capturing() { return capturing_; }
-+
-+ private:
-+  bool capturing_ = false;
-+};
-+
- }  // namespace gpu
- }  // namespace xla
- 
-diff --git a/xla/stream_executor/cuda/cuda_graph.cc b/xla/stream_executor/cuda/cuda_graph.cc
-index 2e6a1cdff..ed055a6b5 100644
---- a/xla/stream_executor/cuda/cuda_graph.cc
-+++ b/xla/stream_executor/cuda/cuda_graph.cc
-@@ -175,17 +175,5 @@ tsl::StatusOr<OwnedCudaGraphExec> InstantiateCudaGraph(OwnedCudaGraph graph) {
-   return OwnedCudaGraphExec(exec);
- }
- 
--tsl::StatusOr<bool> IsStreamCapturing(stream_executor::Stream* stream) {
--  cudaStreamCaptureStatus capture_status;
--  cudaError_t err = cudaStreamIsCapturing(
--      stream_executor::gpu::AsGpuStreamValue(stream), &capture_status);
--  if (err != cudaSuccess) {
--    return InternalError("Failed to get stream's capture status: %s",
--                         cudaGetErrorString(err));
--  }
--
--  return capture_status == cudaStreamCaptureStatusActive;
--}
--
- }  // namespace gpu
- }  // namespace stream_executor
-diff --git a/xla/stream_executor/cuda/cuda_graph.h b/xla/stream_executor/cuda/cuda_graph.h
-index cfe51436f..581f895cf 100644
---- a/xla/stream_executor/cuda/cuda_graph.h
-+++ b/xla/stream_executor/cuda/cuda_graph.h
-@@ -85,9 +85,6 @@ tsl::StatusOr<OwnedCudaGraph> CaptureCudaGraph(
- // Instantiates a captured cuda graph instance into a cuda graph executable.
- tsl::StatusOr<OwnedCudaGraphExec> InstantiateCudaGraph(OwnedCudaGraph graph);
- 
--// Returns true if the stream is in graph capture mode
--tsl::StatusOr<bool> IsStreamCapturing(stream_executor ::Stream* stream);
--
- }  // namespace gpu
- }  // namespace stream_executor
- 
\ No newline at end of file
diff --git a/openxla_patches/triton_filesystem.diff b/openxla_patches/triton_filesystem.diff
deleted file mode 100644
index cab6fbb2748..00000000000
--- a/openxla_patches/triton_filesystem.diff
+++ /dev/null
@@ -1,89 +0,0 @@
-This diff tries to fix a compilation error that std::filesystem cannot be found.
-diff --git a/third_party/triton/cl526173620.patch b/third_party/triton/cl526173620.patch
-index b1addd9b5..4113fc074 100644
---- a/third_party/triton/cl526173620.patch
-+++ b/third_party/triton/cl526173620.patch
-@@ -79,3 +79,82 @@
-        rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, newRet);
-        return mlir::success();
-      }
-+
-+diff --git a/BUILD b/BUILD
-+index b6b2357..18be1fc 100644
-+--- a/BUILD
-++++ b/BUILD
-+@@ -277,6 +277,7 @@ cc_library(
-+     srcs = glob(["lib/Dialect/Triton/IR/*.cpp"]),
-+     hdrs = glob(["include/triton/Dialect/Triton/IR/*.h"]),
-+     copts = ["-Wno-unused-variable"],  # TODO(manany): fix
-++    linkopts = ["-lstdc++fs"],
-+     includes = ["include"],
-+     deps = [
-+         ":triton_dialect_inc_gen",
-+@@ -329,6 +330,7 @@ cc_library(
-+         "include/triton/Dialect/TritonGPU/IR/*.h",
-+     ]),
-+     copts = ["-Wno-unused-variable"],  # TODO(csigg): fix
-++    linkopts = ["-lstdc++fs"],
-+     includes = ["include"],
-+     deps = [
-+         ":TritonDialect",
-+@@ -357,6 +359,7 @@ cc_library(
-+     ]),
-+     hdrs = glob(["include/triton/Dialect/TritonGPU/Transforms/*.h"]),
-+     copts = ["-Wno-unused-variable"],  # TODO(csigg): fix
-++    linkopts = ["-lstdc++fs"],
-+     includes = ["include"],
-+     deps = [
-+         ":TritonDialect",
-+@@ -392,6 +395,7 @@ cc_library(
-+         "include/triton/Conversion/TritonGPUToLLVM/*.h",
-+     ]),
-+     copts = ["-Wno-unused-variable"],  # TODO(csigg): fix
-++    linkopts = ["-lstdc++fs"],
-+     includes = [
-+         "include",
-+         "lib/Conversion/TritonGPUToLLVM",
-+diff --git a/lib/Target/LLVMIR/LLVMIRTranslation.cpp b/lib/Target/LLVMIR/LLVMIRTranslation.cpp
-+index a7ea05a..92bddc5 100644
-+--- a/lib/Target/LLVMIR/LLVMIRTranslation.cpp
-++++ b/lib/Target/LLVMIR/LLVMIRTranslation.cpp
-+@@ -26,7 +26,7 @@
-+ #include "llvm/Linker/Linker.h"
-+ #include "llvm/Support/SourceMgr.h"
-+ #include <dlfcn.h>
-+-#include <filesystem>
-++#include <experimental/filesystem>
-+ #include <iterator>
-+ 
-+ namespace mlir {
-+@@ -152,16 +152,16 @@ static std::map<std::string, std::string> getExternLibs(mlir::ModuleOp module) {
-+       externLibs.try_emplace(libdevice, env_path);
-+       return externLibs;
-+     }
-+-    namespace fs = std::filesystem;
-++    namespace fs = std::experimental::filesystem;
-+     // Search for libdevice relative to its library path if used from Python
-+     // Then native code is in `triton/_C/libtriton.so` and libdevice in
-+     // `triton/third_party/cuda/lib/libdevice.10.bc`
-+     static const auto this_library_path = [] {
-+       Dl_info fileinfo;
-+       if (dladdr(reinterpret_cast<void *>(&getExternLibs), &fileinfo) == 0) {
-+-        return std::filesystem::path();
-++        return std::experimental::filesystem::path();
-+       }
-+-      return std::filesystem::path(fileinfo.dli_fname);
-++      return std::experimental::filesystem::path(fileinfo.dli_fname);
-+     }();
-+     static const auto runtime_path =
-+         this_library_path.parent_path().parent_path() / "third_party" / "cuda" /
-+@@ -174,7 +174,7 @@ static std::map<std::string, std::string> getExternLibs(mlir::ModuleOp module) {
-+       // using its default path:
-+       // [triton root dir]/python/triton/language/libdevice.10.bc
-+       // TODO(Keren): handle external linkage other than libdevice?
-+-      static const auto this_file_path = std::filesystem::path(__FILE__);
-++      static const auto this_file_path = std::experimental::filesystem::path(__FILE__);
-+       static const auto compiletime_path = this_file_path.parent_path()
-+                                                .parent_path()
-+                                                .parent_path()
-\ No newline at end of file
\ No newline at end of file