diff --git a/src/device/op128.h b/src/device/op128.h index 8ee0b4224..72f90d0c1 100644 --- a/src/device/op128.h +++ b/src/device/op128.h @@ -199,8 +199,7 @@ template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack } \ template<> \ __device__ __forceinline__ void st_##space(addr_cxx_ty addr, BytePack value) { \ - data_cxx_ty tmp = __builtin_nontemporal_load((data_cxx_ty *)&value.native); \ - __builtin_nontemporal_store(tmp, (data_cxx_ty *)addr); \ + __builtin_nontemporal_store(value.native, (data_cxx_ty *)addr); \ } // #if __CUDA_ARCH__ >= 700