From 1ea5b0f0b28d69e205a05df04f774900f62e97e3 Mon Sep 17 00:00:00 2001
From: Ian Kettlewell <ian.kettlewell@gmail.com>
Date: Mon, 25 Mar 2024 14:29:42 -0400
Subject: [PATCH] Get Bevy building for WebAssembly with multithreading
 (#12205)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# Objective

This gets Bevy building on Wasm when the `atomics` flag is enabled. This
does not yet multithread Bevy itself, but it allows Bevy users to use a
crate like `wasm_thread` to spawn their own threads and manually
parallelize work. This is a first step towards resolving #4078. Also
fixes #9304.

This provides a foothold so that Bevy contributors can begin to think
about multithreaded Wasm's constraints and Bevy can work towards changes
to get the engine itself multithreaded.

Some flags need to be set on the Rust compiler when compiling for Wasm
multithreading. Here's what my build script looks like, with the correct
flags set, to test out Bevy examples on web:

```bash
set -e
RUSTFLAGS='-C target-feature=+atomics,+bulk-memory,+mutable-globals' \
     cargo build --example breakout --target wasm32-unknown-unknown -Z build-std=std,panic_abort --release
 wasm-bindgen --out-name wasm_example \
   --out-dir examples/wasm/target \
   --target web target/wasm32-unknown-unknown/release/examples/breakout.wasm
 devserver --header Cross-Origin-Opener-Policy='same-origin' --header Cross-Origin-Embedder-Policy='require-corp' --path examples/wasm
```

A few notes:

1. `cpal` crashes immediately when the `atomics` flag is set. That is
patched in https://github.com/RustAudio/cpal/pull/837, but not yet in
the latest crates.io release.

That can be temporarily worked around by patching `cpal` like so:
```toml
[patch.crates-io]
cpal = { git = "https://github.com/RustAudio/cpal" }
```

2. When testing out `wasm_thread` you need to enable the `es_modules`
feature.

## Solution

The largest obstacle to compiling Bevy with `atomics` on web is that
`wgpu` types are _not_ Send and Sync. Longer term Bevy will need an
approach to handle that, but in the near term Bevy is already configured
to be single-threaded on web.

Therefore, it is enough to wrap `wgpu` types in a
`send_wrapper::SendWrapper` that _is_ Send / Sync, but panics if
accessed off the `wgpu` thread.

---

## Changelog

- `wgpu` types that are not `Send` are wrapped in
`send_wrapper::SendWrapper` on Wasm + `atomics`
- `CommandBuffer`s are not generated in parallel on Wasm + `atomics`

## Questions
- Bevy should probably add CI checks to make sure this doesn't regress.
Should that go in this PR or a separate PR? **Edit:** Added checks to
build Wasm with atomics

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: Alice Cecile <alice.i.cecile@gmail.com>
Co-authored-by: daxpedda <daxpedda@gmail.com>
Co-authored-by: François <francois.mockers@vleue.com>
---
 .github/workflows/ci.yml                      |  25 +++++
 crates/bevy_render/Cargo.toml                 |   7 +-
 crates/bevy_render/src/lib.rs                 |   3 +-
 .../src/render_resource/resource_macros.rs    |  40 ++++++-
 crates/bevy_render/src/renderer/mod.rs        | 103 ++++++++++++++----
 .../bevy_render/src/renderer/render_device.rs |   5 +-
 crates/bevy_render/src/view/window/mod.rs     |   6 +-
 7 files changed, 154 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d1d0909cac0097..fc4f2beeea01e1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -145,6 +145,31 @@ jobs:
       - name: Check wasm
         run: cargo check --target wasm32-unknown-unknown
 
+  build-wasm-atomics:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    needs: build
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/bin/
+            ~/.cargo/registry/index/
+            ~/.cargo/registry/cache/
+            ~/.cargo/git/db/
+            target/
+          key: ubuntu-assets-cargo-build-wasm-nightly-${{ hashFiles('**/Cargo.toml') }}
+      - uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ env.NIGHTLY_TOOLCHAIN }}
+          targets: wasm32-unknown-unknown
+          components: rust-src
+      - name: Check wasm
+        run: cargo check --target wasm32-unknown-unknown -Z build-std=std,panic_abort
+        env:
+          RUSTFLAGS: "-C target-feature=+atomics,+bulk-memory"
+
   markdownlint:
     runs-on: ubuntu-latest
     timeout-minutes: 30
diff --git a/crates/bevy_render/Cargo.toml b/crates/bevy_render/Cargo.toml
index 4c856e5ab51165..0252550664fd02 100644
--- a/crates/bevy_render/Cargo.toml
+++ b/crates/bevy_render/Cargo.toml
@@ -66,7 +66,9 @@ image = { version = "0.24", default-features = false }
 # misc
 codespan-reporting = "0.11.0"
 # `fragile-send-sync-non-atomic-wasm` feature means we can't use WASM threads for rendering
-# It is enabled for now to avoid having to do a significant overhaul of the renderer just for wasm
+# It is enabled for now to avoid having to do a significant overhaul of the renderer just for wasm.
+# When the `atomics` target feature is enabled, `fragile-send-sync-non-atomic-wasm` does nothing
+# and Bevy instead wraps `wgpu` types to verify they are not used off their origin thread.
 wgpu = { version = "0.19.3", default-features = false, features = [
   "wgsl",
   "dx12",
@@ -120,6 +122,9 @@ web-sys = { version = "0.3.67", features = [
 ] }
 wasm-bindgen = "0.2"
 
+[target.'cfg(all(target_arch = "wasm32", target_feature = "atomics"))'.dependencies]
+send_wrapper = "0.6.0"
+
 [lints]
 workspace = true
 
diff --git a/crates/bevy_render/src/lib.rs b/crates/bevy_render/src/lib.rs
index 0f673c10388ef0..4290929f8b6703 100644
--- a/crates/bevy_render/src/lib.rs
+++ b/crates/bevy_render/src/lib.rs
@@ -58,6 +58,7 @@ use globals::GlobalsPlugin;
 use renderer::{RenderAdapter, RenderAdapterInfo, RenderDevice, RenderQueue};
 
 use crate::deterministic::DeterministicRenderingConfig;
+use crate::renderer::WgpuWrapper;
 use crate::{
     camera::CameraPlugin,
     mesh::{morph::MorphPlugin, Mesh, MeshPlugin},
@@ -301,7 +302,7 @@ impl Plugin for RenderPlugin {
                             queue,
                             adapter_info,
                             render_adapter,
-                            RenderInstance(Arc::new(instance)),
+                            RenderInstance(Arc::new(WgpuWrapper::new(instance))),
                         ));
                     };
                     // In wasm, spawn a task and detach it for execution
diff --git a/crates/bevy_render/src/render_resource/resource_macros.rs b/crates/bevy_render/src/render_resource/resource_macros.rs
index de2ea0ec00e58e..c027a92e2873f9 100644
--- a/crates/bevy_render/src/render_resource/resource_macros.rs
+++ b/crates/bevy_render/src/render_resource/resource_macros.rs
@@ -9,16 +9,25 @@
 #[macro_export]
 macro_rules! render_resource_wrapper {
     ($wrapper_type:ident, $wgpu_type:ty) => {
+        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
         #[derive(Debug)]
         // SAFETY: while self is live, self.0 comes from `into_raw` of an Arc<$wgpu_type> with a strong ref.
         pub struct $wrapper_type(*const ());
 
+        #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
+        #[derive(Debug)]
+        pub struct $wrapper_type(send_wrapper::SendWrapper<*const ()>);
+
         impl $wrapper_type {
             pub fn new(value: $wgpu_type) -> Self {
                 let arc = std::sync::Arc::new(value);
                 let value_ptr = std::sync::Arc::into_raw(arc);
                 let unit_ptr = value_ptr.cast::<()>();
-                Self(unit_ptr)
+
+                #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
+                return Self(unit_ptr);
+                #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
+                return Self(send_wrapper::SendWrapper::new(unit_ptr));
             }
 
             pub fn try_unwrap(self) -> Option<$wgpu_type> {
@@ -53,13 +62,16 @@ macro_rules! render_resource_wrapper {
             }
         }
 
+        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
         // SAFETY: We manually implement Send and Sync, which is valid for Arc<T> when T: Send + Sync.
         // We ensure correctness by checking that $wgpu_type does implement Send and Sync.
         // If in future there is a case where a wrapper is required for a non-send/sync type
         // we can implement a macro variant that omits these manual Send + Sync impls
         unsafe impl Send for $wrapper_type {}
+        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
         // SAFETY: As explained above, we ensure correctness by checking that $wgpu_type implements Send and Sync.
         unsafe impl Sync for $wrapper_type {}
+        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
         const _: () = {
             trait AssertSendSyncBound: Send + Sync {}
             impl AssertSendSyncBound for $wgpu_type {}
@@ -75,7 +87,14 @@ macro_rules! render_resource_wrapper {
                 std::mem::forget(arc);
                 let cloned_value_ptr = std::sync::Arc::into_raw(cloned);
                 let cloned_unit_ptr = cloned_value_ptr.cast::<()>();
-                Self(cloned_unit_ptr)
+
+                #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
+                return Self(cloned_unit_ptr);
+
+                // Note: this implementation means that this Clone will panic
+                // when called off the wgpu thread.
+                #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
+                return Self(send_wrapper::SendWrapper::new(cloned_unit_ptr));
             }
         }
     };
@@ -85,16 +104,28 @@ macro_rules! render_resource_wrapper {
 #[macro_export]
 macro_rules! render_resource_wrapper {
     ($wrapper_type:ident, $wgpu_type:ty) => {
+        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
         #[derive(Clone, Debug)]
         pub struct $wrapper_type(std::sync::Arc<$wgpu_type>);
+        #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
+        #[derive(Clone, Debug)]
+        pub struct $wrapper_type(std::sync::Arc<send_wrapper::SendWrapper<$wgpu_type>>);
 
         impl $wrapper_type {
             pub fn new(value: $wgpu_type) -> Self {
-                Self(std::sync::Arc::new(value))
+                #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
+                return Self(std::sync::Arc::new(value));
+
+                #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
+                return Self(std::sync::Arc::new(send_wrapper::SendWrapper::new(value)));
             }
 
             pub fn try_unwrap(self) -> Option<$wgpu_type> {
-                std::sync::Arc::try_unwrap(self.0).ok()
+                #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
+                return std::sync::Arc::try_unwrap(self.0).ok();
+
+                #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
+                return std::sync::Arc::try_unwrap(self.0).ok().map(|p| p.take());
             }
         }
 
@@ -106,6 +137,7 @@ macro_rules! render_resource_wrapper {
             }
         }
 
+        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
         const _: () = {
             trait AssertSendSyncBound: Send + Sync {}
             impl AssertSendSyncBound for $wgpu_type {}
diff --git a/crates/bevy_render/src/renderer/mod.rs b/crates/bevy_render/src/renderer/mod.rs
index 92eada8f238d4f..3f1620ae876fe6 100644
--- a/crates/bevy_render/src/renderer/mod.rs
+++ b/crates/bevy_render/src/renderer/mod.rs
@@ -117,23 +117,54 @@ pub fn render_system(world: &mut World, state: &mut SystemState<Query<Entity, Wi
     }
 }
 
+/// A wrapper to safely make `wgpu` types Send / Sync on web with atomics enabled.
+/// On web with `atomics` enabled the inner value can only be accessed
+/// or dropped on the `wgpu` thread or else a panic will occur.
+/// On other platforms the wrapper simply contains the wrapped value.
+#[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
+#[derive(Debug, Clone, Deref, DerefMut)]
+pub struct WgpuWrapper<T>(T);
+#[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
+#[derive(Debug, Clone, Deref, DerefMut)]
+pub struct WgpuWrapper<T>(send_wrapper::SendWrapper<T>);
+
+// SAFETY: SendWrapper is always Send + Sync.
+#[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
+unsafe impl<T> Send for WgpuWrapper<T> {}
+#[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
+unsafe impl<T> Sync for WgpuWrapper<T> {}
+
+#[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
+impl<T> WgpuWrapper<T> {
+    pub fn new(t: T) -> Self {
+        Self(t)
+    }
+}
+
+#[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
+impl<T> WgpuWrapper<T> {
+    pub fn new(t: T) -> Self {
+        Self(send_wrapper::SendWrapper::new(t))
+    }
+}
+
 /// This queue is used to enqueue tasks for the GPU to execute asynchronously.
 #[derive(Resource, Clone, Deref, DerefMut)]
-pub struct RenderQueue(pub Arc<Queue>);
+pub struct RenderQueue(pub Arc<WgpuWrapper<Queue>>);
 
 /// The handle to the physical device being used for rendering.
 /// See [`Adapter`] for more info.
 #[derive(Resource, Clone, Debug, Deref, DerefMut)]
-pub struct RenderAdapter(pub Arc<Adapter>);
+pub struct RenderAdapter(pub Arc<WgpuWrapper<Adapter>>);
 
 /// The GPU instance is used to initialize the [`RenderQueue`] and [`RenderDevice`],
 /// as well as to create [`WindowSurfaces`](crate::view::window::WindowSurfaces).
 #[derive(Resource, Clone, Deref, DerefMut)]
-pub struct RenderInstance(pub Arc<Instance>);
+pub struct RenderInstance(pub Arc<WgpuWrapper<Instance>>);
 
 /// The [`AdapterInfo`] of the adapter in use by the renderer.
 #[derive(Resource, Clone, Deref, DerefMut)]
-pub struct RenderAdapterInfo(pub AdapterInfo);
+pub struct RenderAdapterInfo(pub WgpuWrapper<AdapterInfo>);
 
 const GPU_NOT_FOUND_ERROR_MESSAGE: &str = if cfg!(target_os = "linux") {
     "Unable to find a GPU! Make sure you have installed required drivers! For extra information, see: https://github.com/bevyengine/bevy/blob/latest/docs/linux_dependencies.md"
@@ -300,12 +331,12 @@ pub async fn initialize_renderer(
         )
         .await
         .unwrap();
-    let queue = Arc::new(queue);
-    let adapter = Arc::new(adapter);
+    let queue = Arc::new(WgpuWrapper::new(queue));
+    let adapter = Arc::new(WgpuWrapper::new(adapter));
     (
         RenderDevice::from(device),
         RenderQueue(queue),
-        RenderAdapterInfo(adapter_info),
+        RenderAdapterInfo(WgpuWrapper::new(adapter_info)),
         RenderAdapter(adapter),
     )
 }
@@ -403,7 +434,10 @@ impl<'w> RenderContext<'w> {
     /// buffer.
     pub fn add_command_buffer_generation_task(
         &mut self,
+        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
         task: impl FnOnce(RenderDevice) -> CommandBuffer + 'w + Send,
+        #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
+        task: impl FnOnce(RenderDevice) -> CommandBuffer + 'w,
     ) {
         self.flush_encoder();
 
@@ -425,28 +459,46 @@ impl<'w> RenderContext<'w> {
         self.flush_encoder();
 
         let mut command_buffers = Vec::with_capacity(self.command_buffer_queue.len());
-        let mut task_based_command_buffers = ComputeTaskPool::get().scope(|task_pool| {
-            for (i, queued_command_buffer) in self.command_buffer_queue.into_iter().enumerate() {
-                match queued_command_buffer {
-                    QueuedCommandBuffer::Ready(command_buffer) => {
-                        command_buffers.push((i, command_buffer));
-                    }
-                    QueuedCommandBuffer::Task(command_buffer_generation_task) => {
-                        let render_device = self.render_device.clone();
-                        if self.force_serial {
-                            command_buffers
-                                .push((i, command_buffer_generation_task(render_device)));
-                        } else {
-                            task_pool.spawn(async move {
-                                (i, command_buffer_generation_task(render_device))
-                            });
+
+        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
+        {
+            let mut task_based_command_buffers = ComputeTaskPool::get().scope(|task_pool| {
+                for (i, queued_command_buffer) in self.command_buffer_queue.into_iter().enumerate()
+                {
+                    match queued_command_buffer {
+                        QueuedCommandBuffer::Ready(command_buffer) => {
+                            command_buffers.push((i, command_buffer));
+                        }
+                        QueuedCommandBuffer::Task(command_buffer_generation_task) => {
+                            let render_device = self.render_device.clone();
+                            if self.force_serial {
+                                command_buffers
+                                    .push((i, command_buffer_generation_task(render_device)));
+                            } else {
+                                task_pool.spawn(async move {
+                                    (i, command_buffer_generation_task(render_device))
+                                });
+                            }
                         }
                     }
                 }
+            });
+            command_buffers.append(&mut task_based_command_buffers);
+        }
+
+        #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
+        for (i, queued_command_buffer) in self.command_buffer_queue.into_iter().enumerate() {
+            match queued_command_buffer {
+                QueuedCommandBuffer::Ready(command_buffer) => {
+                    command_buffers.push((i, command_buffer));
+                }
+                QueuedCommandBuffer::Task(command_buffer_generation_task) => {
+                    let render_device = self.render_device.clone();
+                    command_buffers.push((i, command_buffer_generation_task(render_device)));
+                }
             }
-        });
+        }
 
-        command_buffers.append(&mut task_based_command_buffers);
         command_buffers.sort_unstable_by_key(|(i, _)| *i);
 
         let mut command_buffers = command_buffers
@@ -481,5 +533,8 @@ impl<'w> RenderContext<'w> {
 
 enum QueuedCommandBuffer<'w> {
     Ready(CommandBuffer),
+    #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
     Task(Box<dyn FnOnce(RenderDevice) -> CommandBuffer + 'w + Send>),
+    #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
+    Task(Box<dyn FnOnce(RenderDevice) -> CommandBuffer + 'w>),
 }
diff --git a/crates/bevy_render/src/renderer/render_device.rs b/crates/bevy_render/src/renderer/render_device.rs
index 45bccf0bbe667e..1c0b26b912a42d 100644
--- a/crates/bevy_render/src/renderer/render_device.rs
+++ b/crates/bevy_render/src/renderer/render_device.rs
@@ -11,19 +11,20 @@ use wgpu::{
 use super::RenderQueue;
 
 use crate::render_resource::resource_macros::*;
+use crate::WgpuWrapper;
 
 render_resource_wrapper!(ErasedRenderDevice, wgpu::Device);
 
 /// This GPU device is responsible for the creation of most rendering and compute resources.
 #[derive(Resource, Clone)]
 pub struct RenderDevice {
-    device: ErasedRenderDevice,
+    device: WgpuWrapper<ErasedRenderDevice>,
 }
 
 impl From<wgpu::Device> for RenderDevice {
     fn from(device: wgpu::Device) -> Self {
         Self {
-            device: ErasedRenderDevice::new(device),
+            device: WgpuWrapper::new(ErasedRenderDevice::new(device)),
         }
     }
 }
diff --git a/crates/bevy_render/src/view/window/mod.rs b/crates/bevy_render/src/view/window/mod.rs
index 64d089cee0f7ca..ddb0f77f98aefe 100644
--- a/crates/bevy_render/src/view/window/mod.rs
+++ b/crates/bevy_render/src/view/window/mod.rs
@@ -4,7 +4,7 @@ use crate::{
     },
     renderer::{RenderAdapter, RenderDevice, RenderInstance},
     texture::TextureFormatPixelInfo,
-    Extract, ExtractSchedule, Render, RenderApp, RenderSet,
+    Extract, ExtractSchedule, Render, RenderApp, RenderSet, WgpuWrapper,
 };
 use bevy_app::{App, Plugin};
 use bevy_ecs::{entity::EntityHashMap, prelude::*};
@@ -198,7 +198,7 @@ fn extract_windows(
 
 struct SurfaceData {
     // TODO: what lifetime should this be?
-    surface: wgpu::Surface<'static>,
+    surface: WgpuWrapper<wgpu::Surface<'static>>,
     configuration: SurfaceConfiguration,
 }
 
@@ -488,7 +488,7 @@ pub fn create_surfaces(
                 render_device.configure_surface(&surface, &configuration);
 
                 SurfaceData {
-                    surface,
+                    surface: WgpuWrapper::new(surface),
                     configuration,
                 }
             });