diff --git a/WORKSPACE b/WORKSPACE
index 073e270d76a6..d096cf8dd20f 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -41,6 +41,8 @@ http_archive(
         "//openxla_patches:cache_urls.diff",
         "//openxla_patches:constexpr_return.diff",
         "//openxla_patches:gpu_build_file.diff",
+        "//openxla_patches:gpu_race_condition.diff",
+        "//openxla_patches:f16_abi_clang.diff",
     ],
     strip_prefix = "xla-7a19856d74569fd1f765cd03bdee84e3b1fdc579",
     urls = [
diff --git a/openxla_patches/f16_abi_clang.diff b/openxla_patches/f16_abi_clang.diff
new file mode 100644
index 000000000000..24cc8e5b74d5
--- /dev/null
+++ b/openxla_patches/f16_abi_clang.diff
@@ -0,0 +1,19 @@
+Remove the _Float16 branch when selecting XlaF16ABIType; upstream CI will fail without this patch.
+diff --git a/xla/service/cpu/runtime_fp16.h b/xla/service/cpu/runtime_fp16.h
+index 3f7af5197..ce4491c5d 100644
+--- a/xla/service/cpu/runtime_fp16.h
++++ b/xla/service/cpu/runtime_fp16.h
+@@ -18,12 +18,7 @@ limitations under the License.
+ 
+ #include <stdint.h>
+ 
+-// _Float16 always gets us the correct ABI type, so use that if available.
+-// AArch64 GCC defines __FLT16_MANT_DIG__ even when _Float16 is not available.
+-#if defined(__FLT16_MANT_DIG__) && \
+-    (defined(__clang__) || !(defined(__GNUC__) && defined(__aarch64__)))
+-using XlaF16ABIType = _Float16;
+-#elif defined(__x86_64__)
++#if defined(__x86_64__)
+ // Older versions of Clang don't have _Float16. Since both float and _Float16
+ // are passed in the same register we can use the wider type and careful casting
+ // to conform to x86_64 psABI. This only works with the assumption that we're
\ No newline at end of file
diff --git a/openxla_patches/gpu_race_condition.diff b/openxla_patches/gpu_race_condition.diff
new file mode 100644
index 000000000000..dfdc3aa74608
--- /dev/null
+++ b/openxla_patches/gpu_race_condition.diff
@@ -0,0 +1,14 @@
+diff --git a/xla/service/gpu/gpu_executable.cc b/xla/service/gpu/gpu_executable.cc
+index 242961dd1..787275868 100644
+--- a/xla/service/gpu/gpu_executable.cc
++++ b/xla/service/gpu/gpu_executable.cc
+@@ -563,8 +563,7 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
+   }
+ 
+   // Force synchronous execution if the allocator requires it.
+-  const bool block_host_until_done =
+-      !memory_allocator->AllowsAsynchronousDeallocation();
++  const bool block_host_until_done = true;
+ 
+ 
+   // Lock the GPU with a shared lock so that we don't interfere with autotuning
\ No newline at end of file
diff --git a/setup.py b/setup.py
index f157fd728097..b3b39cd709e4 100644
--- a/setup.py
+++ b/setup.py
@@ -72,7 +72,7 @@
 
 base_dir = os.path.dirname(os.path.abspath(__file__))
 
-_libtpu_version = '0.1.dev20230825'
+_libtpu_version = '0.1.dev20231003'
 _libtpu_storage_path = f'https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/wheels/libtpu-nightly/libtpu_nightly-{_libtpu_version}-py3-none-any.whl'