diff --git a/WORKSPACE b/WORKSPACE index 073e270d76a6..d096cf8dd20f 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -41,6 +41,8 @@ http_archive( "//openxla_patches:cache_urls.diff", "//openxla_patches:constexpr_return.diff", "//openxla_patches:gpu_build_file.diff", + "//openxla_patches:gpu_race_condition.diff", + "//openxla_patches:f16_abi_clang.diff", ], strip_prefix = "xla-7a19856d74569fd1f765cd03bdee84e3b1fdc579", urls = [ diff --git a/openxla_patches/f16_abi_clang.diff b/openxla_patches/f16_abi_clang.diff new file mode 100644 index 000000000000..24cc8e5b74d5 --- /dev/null +++ b/openxla_patches/f16_abi_clang.diff @@ -0,0 +1,19 @@ +upstream CI will fail without this +diff --git a/xla/service/cpu/runtime_fp16.h b/xla/service/cpu/runtime_fp16.h +index 3f7af5197..ce4491c5d 100644 +--- a/xla/service/cpu/runtime_fp16.h ++++ b/xla/service/cpu/runtime_fp16.h +@@ -18,12 +18,7 @@ limitations under the License. + + #include + +-// _Float16 always gets us the correct ABI type, so use that if available. +-// AArch64 GCC defines __FLT16_MANT_DIG__ even when _Float16 is not available. +-#if defined(__FLT16_MANT_DIG__) && \ +- (defined(__clang__) || !(defined(__GNUC__) && defined(__aarch64__))) +-using XlaF16ABIType = _Float16; +-#elif defined(__x86_64__) ++#if defined(__x86_64__) + // Older versions of Clang don't have _Float16. Since both float and _Float16 + // are passed in the same register we can use the wider type and careful casting + // to conform to x86_64 psABI. This only works with the assumption that we're \ No newline at end of file diff --git a/openxla_patches/gpu_race_condition.diff b/openxla_patches/gpu_race_condition.diff new file mode 100644 index 000000000000..dfdc3aa74608 --- /dev/null +++ b/openxla_patches/gpu_race_condition.diff @@ -0,0 +1,14 @@ +diff --git a/xla/service/gpu/gpu_executable.cc b/xla/service/gpu/gpu_executable.cc +index 242961dd1..787275868 100644 +--- a/xla/service/gpu/gpu_executable.cc ++++ b/xla/service/gpu/gpu_executable.cc +@@ -563,8 +563,7 @@ StatusOr GpuExecutable::ExecuteAsyncOnStreamImpl( + } + + // Force synchronous execution if the allocator requires it. +- const bool block_host_until_done = +- !memory_allocator->AllowsAsynchronousDeallocation(); ++ const bool block_host_until_done = true; + + + // Lock the GPU with a shared lock so that we don't interfere with autotuning \ No newline at end of file diff --git a/setup.py b/setup.py index f157fd728097..b3b39cd709e4 100644 --- a/setup.py +++ b/setup.py @@ -72,7 +72,7 @@ base_dir = os.path.dirname(os.path.abspath(__file__)) -_libtpu_version = '0.1.dev20230825' +_libtpu_version = '0.1.dev20231003' _libtpu_storage_path = f'https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/wheels/libtpu-nightly/libtpu_nightly-{_libtpu_version}-py3-none-any.whl'