diff --git a/Cargo.lock b/Cargo.lock index 1e2ae3c4c4..f66040ae51 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -582,6 +582,17 @@ version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" +[[package]] +name = "futures-intrusive" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62007592ac46aa7c2b6416f7deb9a8a8f63a01e0f1d6e1787d5630170db2b63e" +dependencies = [ + "futures-core", + "lock_api", + "parking_lot 0.11.2", +] + [[package]] name = "futures-io" version = "0.3.21" @@ -2039,6 +2050,7 @@ dependencies = [ "console_log", "ddsfile", "env_logger", + "futures-intrusive", "glam", "js-sys", "log", diff --git a/deno_webgpu/src/buffer.rs b/deno_webgpu/src/buffer.rs index 23ba5ede9f..97934c0159 100644 --- a/deno_webgpu/src/buffer.rs +++ b/deno_webgpu/src/buffer.rs @@ -83,36 +83,27 @@ pub async fn op_webgpu_buffer_get_map_async( .get::(device_rid)?; device = device_resource.0; - let boxed_sender = Box::new(sender); - let sender_ptr = Box::into_raw(boxed_sender) as *mut u8; - - extern "C" fn buffer_map_future_wrapper( - status: wgpu_core::resource::BufferMapAsyncStatus, - user_data: *mut u8, - ) { - let sender_ptr = user_data as *mut oneshot::Sender>; - let boxed_sender = unsafe { Box::from_raw(sender_ptr) }; - boxed_sender + let callback = Box::new(move |status| { + sender .send(match status { wgpu_core::resource::BufferMapAsyncStatus::Success => Ok(()), _ => unreachable!(), // TODO }) .unwrap(); - } + }); // TODO(lucacasonato): error handling let maybe_err = gfx_select!(buffer => instance.buffer_map_async( - buffer, - offset..(offset + size), - wgpu_core::resource::BufferMapOperation { - host: match mode { - 1 => wgpu_core::device::HostMap::Read, - 2 => wgpu_core::device::HostMap::Write, - _ => unreachable!(), - }, - callback: buffer_map_future_wrapper, - user_data: sender_ptr, - } + buffer, + offset..(offset + size), + wgpu_core::resource::BufferMapOperation { + host: match mode { + 1 => wgpu_core::device::HostMap::Read, + 2 => wgpu_core::device::HostMap::Write, + _ => unreachable!(), + }, + callback: wgpu_core::resource::BufferMapCallback::from_rust(callback), + } )) .err(); diff --git a/deno_webgpu/src/lib.rs b/deno_webgpu/src/lib.rs index 9e3a9af158..25de7f177c 100644 --- a/deno_webgpu/src/lib.rs +++ b/deno_webgpu/src/lib.rs @@ -128,6 +128,9 @@ fn deserialize_features(features: &wgpu_types::Features) -> Vec<&'static str> { if features.contains(wgpu_types::Features::DEPTH_CLIP_CONTROL) { return_features.push("depth-clip-control"); } + if features.contains(wgpu_types::Features::DEPTH24UNORM_STENCIL8) { + return_features.push("depth24unorm-stencil8"); + } if features.contains(wgpu_types::Features::DEPTH32FLOAT_STENCIL8) { return_features.push("depth32float-stencil8"); } @@ -284,6 +287,10 @@ impl From for wgpu_types::Features { features.set( wgpu_types::Features::DEPTH_CLIP_CONTROL, required_features.0.contains("depth-clip-control"), + ); + features.set( + wgpu_types::Features::DEPTH24UNORM_STENCIL8, + required_features.0.contains("depth24unorm-stencil8"), ); features.set( wgpu_types::Features::DEPTH32FLOAT_STENCIL8, diff --git a/player/src/bin/play.rs b/player/src/bin/play.rs index 9b5722490c..d973ec48c0 100644 --- a/player/src/bin/play.rs +++ b/player/src/bin/play.rs @@ -181,7 +181,7 @@ fn main() { }, Event::LoopDestroyed => { log::info!("Closing"); - gfx_select!(device => global.device_poll(device, true, None)).unwrap(); + gfx_select!(device => 
global.device_poll(device, wgt::Maintain::Wait)).unwrap(); } _ => {} } diff --git a/player/tests/data/pipeline-statistics-query.ron b/player/tests/data/pipeline-statistics-query.ron index 28c025b743..0975d8e126 100644 --- a/player/tests/data/pipeline-statistics-query.ron +++ b/player/tests/data/pipeline-statistics-query.ron @@ -1,5 +1,5 @@ ( - features: 0x0000_0000_0000_0080, // PIPELINE_STATISTICS_QUERY + features: 0x0000_0000_0000_0100, // PIPELINE_STATISTICS_QUERY expectations: [ ( name: "Queried number of compute invocations is correct", diff --git a/player/tests/test.rs b/player/tests/test.rs index 7d1c156a26..0ced0fad8c 100644 --- a/player/tests/test.rs +++ b/player/tests/test.rs @@ -14,7 +14,7 @@ use std::{ fs::{read_to_string, File}, io::{Read, Seek, SeekFrom}, path::{Path, PathBuf}, - ptr, slice, + slice, }; #[derive(serde::Deserialize)] @@ -55,7 +55,7 @@ struct Test<'a> { actions: Vec>, } -extern "C" fn map_callback(status: wgc::resource::BufferMapAsyncStatus, _user_data: *mut u8) { +fn map_callback(status: wgc::resource::BufferMapAsyncStatus) { match status { wgc::resource::BufferMapAsyncStatus::Success => (), _ => panic!("Unable to map"), @@ -112,8 +112,9 @@ impl Test<'_> { expect.offset .. expect.offset+expect.data.len() as wgt::BufferAddress, wgc::resource::BufferMapOperation { host: wgc::device::HostMap::Read, - callback: map_callback, - user_data: ptr::null_mut(), + callback: wgc::resource::BufferMapCallback::from_rust( + Box::new(map_callback) + ), } )) .unwrap(); diff --git a/wgpu-core/LICENSE.APACHE b/wgpu-core/LICENSE.APACHE new file mode 120000 index 0000000000..7141cad5b2 --- /dev/null +++ b/wgpu-core/LICENSE.APACHE @@ -0,0 +1 @@ +../LICENSE.APACHE \ No newline at end of file diff --git a/wgpu-core/LICENSE.MIT b/wgpu-core/LICENSE.MIT new file mode 120000 index 0000000000..6b8772d1a7 --- /dev/null +++ b/wgpu-core/LICENSE.MIT @@ -0,0 +1 @@ +../LICENSE.MIT \ No newline at end of file diff --git a/wgpu-core/src/command/draw.rs b/wgpu-core/src/command/draw.rs index 2206f3f204..d5fa612fa6 100644 --- a/wgpu-core/src/command/draw.rs +++ b/wgpu-core/src/command/draw.rs @@ -87,10 +87,12 @@ pub enum RenderCommandError { MissingTextureUsage(#[from] MissingTextureUsageError), #[error(transparent)] PushConstants(#[from] PushConstantUploadError), - #[error("Invalid Viewport parameters")] - InvalidViewport, - #[error("Invalid ScissorRect parameters")] - InvalidScissorRect, + #[error("Viewport width {0} and/or height {1} are less than or equal to 0")] + InvalidViewportDimension(f32, f32), + #[error("Viewport minDepth {0} and/or maxDepth {1} are not in [0, 1]")] + InvalidViewportDepth(f32, f32), + #[error("Scissor {0:?} is not contained in the render target {1:?}")] + InvalidScissorRect(Rect, wgt::Extent3d), #[error("Support for {0} is not implemented yet")] Unimplemented(&'static str), } diff --git a/wgpu-core/src/command/render.rs b/wgpu-core/src/command/render.rs index 7ef45aa0a6..9443bea895 100644 --- a/wgpu-core/src/command/render.rs +++ b/wgpu-core/src/command/render.rs @@ -1451,14 +1451,17 @@ impl Global { depth_max, } => { let scope = PassErrorScope::SetViewport; - if rect.w <= 0.0 - || rect.h <= 0.0 - || depth_min < 0.0 - || depth_min > 1.0 - || depth_max < 0.0 - || depth_max > 1.0 - { - return Err(RenderCommandError::InvalidViewport).map_pass_err(scope); + if rect.w <= 0.0 || rect.h <= 0.0 { + return Err(RenderCommandError::InvalidViewportDimension( + rect.w, rect.h, + )) + .map_pass_err(scope); + } + if !(0.0..=1.0).contains(&depth_min) || !(0.0..=1.0).contains(&depth_max) { 
+ return Err(RenderCommandError::InvalidViewportDepth( + depth_min, depth_max, + )) + .map_pass_err(scope); } let r = hal::Rect { x: rect.x, @@ -1510,7 +1513,8 @@ impl Global { || rect.x + rect.w > info.extent.width || rect.y + rect.h > info.extent.height { - return Err(RenderCommandError::InvalidScissorRect).map_pass_err(scope); + return Err(RenderCommandError::InvalidScissorRect(*rect, info.extent)) + .map_pass_err(scope); } let r = hal::Rect { x: rect.x, diff --git a/wgpu-core/src/command/transfer.rs b/wgpu-core/src/command/transfer.rs index 998263b44e..2a0f9a9558 100644 --- a/wgpu-core/src/command/transfer.rs +++ b/wgpu-core/src/command/transfer.rs @@ -313,7 +313,8 @@ pub(crate) fn validate_texture_copy_range( wgt::TextureFormat::Depth32Float | wgt::TextureFormat::Depth32FloatStencil8 | wgt::TextureFormat::Depth24Plus - | wgt::TextureFormat::Depth24PlusStencil8 => { + | wgt::TextureFormat::Depth24PlusStencil8 + | wgt::TextureFormat::Depth24UnormStencil8 => { if *copy_size != extent { return Err(TransferError::InvalidDepthTextureExtent); } diff --git a/wgpu-core/src/conv.rs b/wgpu-core/src/conv.rs index 9ffa956aa4..9ab3cbbecb 100644 --- a/wgpu-core/src/conv.rs +++ b/wgpu-core/src/conv.rs @@ -19,9 +19,11 @@ pub fn is_valid_copy_src_texture_format(format: wgt::TextureFormat) -> bool { pub fn is_valid_copy_dst_texture_format(format: wgt::TextureFormat) -> bool { use wgt::TextureFormat as Tf; match format { - Tf::Depth32Float | Tf::Depth32FloatStencil8 | Tf::Depth24Plus | Tf::Depth24PlusStencil8 => { - false - } + Tf::Depth32Float + | Tf::Depth32FloatStencil8 + | Tf::Depth24Plus + | Tf::Depth24PlusStencil8 + | Tf::Depth24UnormStencil8 => false, _ => true, } } diff --git a/wgpu-core/src/device/life.rs b/wgpu-core/src/device/life.rs index 27f1708860..3f8cd6e7cf 100644 --- a/wgpu-core/src/device/life.rs +++ b/wgpu-core/src/device/life.rs @@ -442,15 +442,18 @@ impl LifetimeTracker { } } - pub fn add_work_done_closure(&mut self, closure: SubmittedWorkDoneClosure) -> bool { + pub fn add_work_done_closure( + &mut self, + closure: SubmittedWorkDoneClosure, + ) -> Option { match self.active.last_mut() { Some(active) => { active.work_done_closures.push(closure); - true + None } // Note: we can't immediately invoke the closure, since it assumes // nothing is currently locked in the hubs. - None => false, + None => Some(closure), } } } diff --git a/wgpu-core/src/device/mod.rs b/wgpu-core/src/device/mod.rs index 35ae13e7e7..8a9eb0fbbe 100644 --- a/wgpu-core/src/device/mod.rs +++ b/wgpu-core/src/device/mod.rs @@ -141,14 +141,14 @@ impl UserClosures { self.submissions.extend(other.submissions); } - unsafe fn fire(self) { - //Note: this logic is specifically moved out of `handle_mapping()` in order to + fn fire(self) { + // Note: this logic is specifically moved out of `handle_mapping()` in order to // have nothing locked by the time we execute users callback code. 
for (operation, status) in self.mappings { - (operation.callback)(status, operation.user_data); + operation.callback.call(status); } for closure in self.submissions { - (closure.callback)(closure.user_data); + closure.call(); } } } @@ -1027,16 +1027,6 @@ impl Device { format_features: texture.format_features, extent, samples: texture.desc.sample_count, - // once a storage - forever a storage - sampled_internal_use: if texture - .desc - .usage - .contains(wgt::TextureUsages::STORAGE_BINDING) - { - hal::TextureUses::RESOURCE | hal::TextureUses::STORAGE_READ - } else { - hal::TextureUses::RESOURCE - }, selector, life_guard: LifeGuard::new(desc.label.borrow_or_default()), }) @@ -2006,7 +1996,7 @@ impl Device { } Ok(( wgt::TextureUsages::TEXTURE_BINDING, - view.sampled_internal_use, + hal::TextureUses::RESOURCE, )) } wgt::BindingType::StorageTexture { @@ -4997,9 +4987,9 @@ impl Global { .map_err(|_| DeviceError::Invalid)? .maintain(hub, maintain, &mut token)? }; - unsafe { - closures.fire(); - } + + closures.fire(); + Ok(queue_empty) } @@ -5082,9 +5072,7 @@ impl Global { self.poll_devices::(force_wait, &mut closures)? && all_queue_empty; } - unsafe { - closures.fire(); - } + closures.fire(); Ok(all_queue_empty) } @@ -5191,7 +5179,7 @@ impl Global { return Err(resource::BufferAccessError::AlreadyMapped); } resource::BufferMapState::Waiting(_) => { - op.call_error(); + op.callback.call_error(); return Ok(()); } resource::BufferMapState::Idle => { @@ -5408,9 +5396,7 @@ impl Global { //Note: outside inner function so no locks are held when calling the callback let closure = self.buffer_unmap_inner::(buffer_id)?; if let Some((operation, status)) = closure { - unsafe { - (operation.callback)(status, operation.user_data); - } + operation.callback.call(status); } Ok(()) } diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs index 8b1a662272..6e0f589cae 100644 --- a/wgpu-core/src/device/queue.rs +++ b/wgpu-core/src/device/queue.rs @@ -28,16 +28,56 @@ use thiserror::Error; /// without a concrete moment of when it can be cleared. const WRITE_COMMAND_BUFFERS_PER_POOL: usize = 64; -pub type OnSubmittedWorkDoneCallback = unsafe extern "C" fn(user_data: *mut u8); #[repr(C)] -#[derive(Clone, Copy, Debug)] +pub struct SubmittedWorkDoneClosureC { + callback: unsafe extern "C" fn(user_data: *mut u8), + user_data: *mut u8, +} + +unsafe impl Send for SubmittedWorkDoneClosureC {} + pub struct SubmittedWorkDoneClosure { - pub callback: OnSubmittedWorkDoneCallback, - pub user_data: *mut u8, + // We wrap this so creating the enum in the C variant can be unsafe, + // allowing our call function to be safe. + inner: SubmittedWorkDoneClosureInner, +} + +enum SubmittedWorkDoneClosureInner { + Rust { + callback: Box, + }, + C { + inner: SubmittedWorkDoneClosureC, + }, } -unsafe impl Send for SubmittedWorkDoneClosure {} -unsafe impl Sync for SubmittedWorkDoneClosure {} +impl SubmittedWorkDoneClosure { + pub fn from_rust(callback: Box) -> Self { + Self { + inner: SubmittedWorkDoneClosureInner::Rust { callback }, + } + } + + /// # Safety + /// + /// - The callback pointer must be valid to call with the provided user_data pointer. + /// - Both pointers must point to 'static data as the callback may happen at an unspecified time. 
+ pub unsafe fn from_c(inner: SubmittedWorkDoneClosureC) -> Self { + Self { + inner: SubmittedWorkDoneClosureInner::C { inner }, + } + } + + pub(crate) fn call(self) { + match self.inner { + SubmittedWorkDoneClosureInner::Rust { callback } => callback(), + // SAFETY: the contract of the call to from_c says that this unsafe is sound. + SubmittedWorkDoneClosureInner::C { inner } => unsafe { + (inner.callback)(inner.user_data) + }, + } + } +} #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -940,9 +980,8 @@ impl Global { }; // the closures should execute with nothing locked! - unsafe { - callbacks.fire(); - } + callbacks.fire(); + Ok(WrappedSubmissionIndex { queue_id, index: submit_index, @@ -968,7 +1007,7 @@ impl Global { closure: SubmittedWorkDoneClosure, ) -> Result<(), InvalidQueue> { //TODO: flush pending writes - let added = { + let closure_opt = { let hub = A::hub(self); let mut token = Token::root(); let (device_guard, mut token) = hub.devices.read(&mut token); @@ -977,10 +1016,8 @@ impl Global { Err(_) => return Err(InvalidQueue), } }; - if !added { - unsafe { - (closure.callback)(closure.user_data); - } + if let Some(closure) = closure_opt { + closure.call(); } Ok(()) } diff --git a/wgpu-core/src/resource.rs b/wgpu-core/src/resource.rs index a47e064f44..6f45e26399 100644 --- a/wgpu-core/src/resource.rs +++ b/wgpu-core/src/resource.rs @@ -23,7 +23,6 @@ pub enum BufferMapAsyncStatus { ContextLost, } -#[derive(Debug)] pub(crate) enum BufferMapState { /// Mapped at creation. Init { @@ -46,29 +45,67 @@ pub(crate) enum BufferMapState { unsafe impl Send for BufferMapState {} unsafe impl Sync for BufferMapState {} -pub type BufferMapCallback = unsafe extern "C" fn(status: BufferMapAsyncStatus, userdata: *mut u8); - #[repr(C)] -#[derive(Debug)] -pub struct BufferMapOperation { - pub host: HostMap, - pub callback: BufferMapCallback, - pub user_data: *mut u8, +pub struct BufferMapCallbackC { + callback: unsafe extern "C" fn(status: BufferMapAsyncStatus, user_data: *mut u8), + user_data: *mut u8, } -//TODO: clarify if/why this is needed here -unsafe impl Send for BufferMapOperation {} -unsafe impl Sync for BufferMapOperation {} +unsafe impl Send for BufferMapCallbackC {} + +pub struct BufferMapCallback { + // We wrap this so creating the enum in the C variant can be unsafe, + // allowing our call function to be safe. + inner: BufferMapCallbackInner, +} + +enum BufferMapCallbackInner { + Rust { + callback: Box, + }, + C { + inner: BufferMapCallbackC, + }, +} + +impl BufferMapCallback { + pub fn from_rust(callback: Box) -> Self { + Self { + inner: BufferMapCallbackInner::Rust { callback }, + } + } + + /// # Safety + /// + /// - The callback pointer must be valid to call with the provided user_data pointer. + /// - Both pointers must point to 'static data as the callback may happen at an unspecified time. + pub unsafe fn from_c(inner: BufferMapCallbackC) -> Self { + Self { + inner: BufferMapCallbackInner::C { inner }, + } + } + + pub(crate) fn call(self, status: BufferMapAsyncStatus) { + match self.inner { + BufferMapCallbackInner::Rust { callback } => callback(status), + // SAFETY: the contract of the call to from_c says that this unsafe is sound. 
+ BufferMapCallbackInner::C { inner } => unsafe { + (inner.callback)(status, inner.user_data) + }, + } + } -impl BufferMapOperation { pub(crate) fn call_error(self) { log::error!("wgpu_buffer_map_async failed: buffer mapping is pending"); - unsafe { - (self.callback)(BufferMapAsyncStatus::Error, self.user_data); - } + self.call(BufferMapAsyncStatus::Error); } } +pub struct BufferMapOperation { + pub host: HostMap, + pub callback: BufferMapCallback, +} + #[derive(Clone, Debug, Error)] pub enum BufferAccessError { #[error(transparent)] @@ -105,7 +142,6 @@ pub enum BufferAccessError { }, } -#[derive(Debug)] pub(crate) struct BufferPendingMapping { pub range: Range, pub op: BufferMapOperation, @@ -115,7 +151,6 @@ pub(crate) struct BufferPendingMapping { pub type BufferDescriptor<'a> = wgt::BufferDescriptor>; -#[derive(Debug)] pub struct Buffer { pub(crate) raw: Option, pub(crate) device_id: Stored, @@ -385,8 +420,6 @@ pub struct TextureView { pub(crate) format_features: wgt::TextureFormatFeatures, pub(crate) extent: wgt::Extent3d, pub(crate) samples: u32, - /// Internal use of this texture view when used as `BindingType::Texture`. - pub(crate) sampled_internal_use: hal::TextureUses, pub(crate) selector: TextureSelector, pub(crate) life_guard: LifeGuard, } diff --git a/wgpu-core/src/validation.rs b/wgpu-core/src/validation.rs index 148eeecfa5..0264edb29d 100644 --- a/wgpu-core/src/validation.rs +++ b/wgpu-core/src/validation.rs @@ -263,6 +263,8 @@ pub enum StageError { #[source] error: InputError, }, + #[error("location[{location}] is provided by the previous stage output but is not consumed as input by this stage.")] + InputNotConsumed { location: wgt::ShaderLocation }, } fn map_storage_format_to_naga(format: wgt::TextureFormat) -> Option { @@ -705,7 +707,8 @@ impl NumericType { Tf::Depth32Float | Tf::Depth32FloatStencil8 | Tf::Depth24Plus - | Tf::Depth24PlusStencil8 => { + | Tf::Depth24PlusStencil8 + | Tf::Depth24UnormStencil8 => { panic!("Unexpected depth format") } Tf::Rgb9e5Ufloat => (NumericDimension::Vector(Vs::Tri), Sk::Float), @@ -1158,6 +1161,21 @@ impl Interface { } } + // Check all vertex outputs and make sure the fragment shader consumes them. + if shader_stage == naga::ShaderStage::Fragment { + for &index in inputs.keys() { + // This is a linear scan, but the count should be low enough that this should be fine. + let found = entry_point.inputs.iter().any(|v| match *v { + Varying::Local { location, .. } => location == index, + Varying::BuiltIn(_) => false, + }); + + if !found { + return Err(StageError::InputNotConsumed { location: index }); + } + } + } + if shader_stage == naga::ShaderStage::Vertex { for output in entry_point.outputs.iter() { //TODO: count builtins towards the limit? 
diff --git a/wgpu-hal/LICENSE.APACHE b/wgpu-hal/LICENSE.APACHE new file mode 120000 index 0000000000..7141cad5b2 --- /dev/null +++ b/wgpu-hal/LICENSE.APACHE @@ -0,0 +1 @@ +../LICENSE.APACHE \ No newline at end of file diff --git a/wgpu-hal/LICENSE.MIT b/wgpu-hal/LICENSE.MIT new file mode 120000 index 0000000000..6b8772d1a7 --- /dev/null +++ b/wgpu-hal/LICENSE.MIT @@ -0,0 +1 @@ +../LICENSE.MIT \ No newline at end of file diff --git a/wgpu-hal/src/auxil/dxgi/conv.rs b/wgpu-hal/src/auxil/dxgi/conv.rs index 2fd9bf04ca..384775c3e0 100644 --- a/wgpu-hal/src/auxil/dxgi/conv.rs +++ b/wgpu-hal/src/auxil/dxgi/conv.rs @@ -49,7 +49,7 @@ pub fn map_texture_format(format: wgt::TextureFormat) -> dxgiformat::DXGI_FORMAT Tf::Depth32Float => DXGI_FORMAT_D32_FLOAT, Tf::Depth32FloatStencil8 => DXGI_FORMAT_D32_FLOAT_S8X24_UINT, Tf::Depth24Plus => DXGI_FORMAT_D24_UNORM_S8_UINT, - Tf::Depth24PlusStencil8 => DXGI_FORMAT_D24_UNORM_S8_UINT, + Tf::Depth24PlusStencil8 | Tf::Depth24UnormStencil8 => DXGI_FORMAT_D24_UNORM_S8_UINT, Tf::Rgb9e5Ufloat => DXGI_FORMAT_R9G9B9E5_SHAREDEXP, Tf::Bc1RgbaUnorm => DXGI_FORMAT_BC1_UNORM, Tf::Bc1RgbaUnormSrgb => DXGI_FORMAT_BC1_UNORM_SRGB, @@ -100,9 +100,9 @@ pub fn map_texture_format_nodepth(format: wgt::TextureFormat) -> dxgiformat::DXG wgt::TextureFormat::Depth32FloatStencil8 => { dxgiformat::DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS } - wgt::TextureFormat::Depth24Plus | wgt::TextureFormat::Depth24PlusStencil8 => { - dxgiformat::DXGI_FORMAT_R24_UNORM_X8_TYPELESS - } + wgt::TextureFormat::Depth24Plus + | wgt::TextureFormat::Depth24PlusStencil8 + | wgt::TextureFormat::Depth24UnormStencil8 => dxgiformat::DXGI_FORMAT_R24_UNORM_X8_TYPELESS, _ => { assert_eq!( crate::FormatAspects::from(format), @@ -117,9 +117,9 @@ pub fn map_texture_format_depth_typeless(format: wgt::TextureFormat) -> dxgiform match format { wgt::TextureFormat::Depth32Float => dxgiformat::DXGI_FORMAT_R32_TYPELESS, wgt::TextureFormat::Depth32FloatStencil8 => dxgiformat::DXGI_FORMAT_R32G8X24_TYPELESS, - wgt::TextureFormat::Depth24Plus | wgt::TextureFormat::Depth24PlusStencil8 => { - dxgiformat::DXGI_FORMAT_R24G8_TYPELESS - } + wgt::TextureFormat::Depth24Plus + | wgt::TextureFormat::Depth24PlusStencil8 + | wgt::TextureFormat::Depth24UnormStencil8 => dxgiformat::DXGI_FORMAT_R24G8_TYPELESS, _ => unreachable!(), } } diff --git a/wgpu-hal/src/dx12/adapter.rs b/wgpu-hal/src/dx12/adapter.rs index d05f1c8d0f..c8d390fcfe 100644 --- a/wgpu-hal/src/dx12/adapter.rs +++ b/wgpu-hal/src/dx12/adapter.rs @@ -41,6 +41,10 @@ impl super::Adapter { } } + pub fn raw_adapter(&self) -> &native::DxgiAdapter { + &self.raw + } + #[allow(trivial_casts)] pub(super) fn expose( adapter: native::DxgiAdapter, @@ -185,6 +189,7 @@ impl super::Adapter { let mut features = wgt::Features::empty() | wgt::Features::DEPTH_CLIP_CONTROL + | wgt::Features::DEPTH24UNORM_STENCIL8 | wgt::Features::DEPTH32FLOAT_STENCIL8 | wgt::Features::INDIRECT_FIRST_INSTANCE | wgt::Features::MAPPABLE_PRIMARY_BUFFERS diff --git a/wgpu-hal/src/gles/adapter.rs b/wgpu-hal/src/gles/adapter.rs index 44026341e3..90f845813b 100644 --- a/wgpu-hal/src/gles/adapter.rs +++ b/wgpu-hal/src/gles/adapter.rs @@ -662,7 +662,8 @@ impl crate::Adapter for super::Adapter { Tf::Depth32Float | Tf::Depth32FloatStencil8 | Tf::Depth24Plus - | Tf::Depth24PlusStencil8 => depth, + | Tf::Depth24PlusStencil8 + | Tf::Depth24UnormStencil8 => depth, Tf::Rgb9e5Ufloat | Tf::Bc1RgbaUnorm | Tf::Bc1RgbaUnormSrgb diff --git a/wgpu-hal/src/gles/conv.rs b/wgpu-hal/src/gles/conv.rs index fcd6e2791a..bf0f3bf0bd 100644 --- 
a/wgpu-hal/src/gles/conv.rs +++ b/wgpu-hal/src/gles/conv.rs @@ -65,7 +65,7 @@ impl super::AdapterShared { glow::DEPTH_COMPONENT, glow::UNSIGNED_NORMALIZED, ), - Tf::Depth24PlusStencil8 => ( + Tf::Depth24PlusStencil8 | Tf::Depth24UnormStencil8 => ( glow::DEPTH24_STENCIL8, glow::DEPTH_COMPONENT, glow::UNSIGNED_INT, diff --git a/wgpu-hal/src/gles/egl.rs b/wgpu-hal/src/gles/egl.rs index 9539fe211e..06167ec841 100644 --- a/wgpu-hal/src/gles/egl.rs +++ b/wgpu-hal/src/gles/egl.rs @@ -580,6 +580,24 @@ pub struct Instance { inner: Mutex, } +impl Instance { + pub fn raw_display(&self) -> egl::Display { + self.inner + .try_lock() + .expect("Could not lock instance. This is most-likely a deadlock.") + .egl + .display + } + + /// Returns the version of the EGL display. + pub fn egl_version(&self) -> (i32, i32) { + self.inner + .try_lock() + .expect("Could not lock instance. This is most-likely a deadlock.") + .version + } +} + unsafe impl Send for Instance {} unsafe impl Sync for Instance {} diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs index 0737053514..6f1c2dde1e 100644 --- a/wgpu-hal/src/lib.rs +++ b/wgpu-hal/src/lib.rs @@ -603,9 +603,9 @@ impl From for FormatAspects { fn from(format: wgt::TextureFormat) -> Self { match format { wgt::TextureFormat::Depth32Float | wgt::TextureFormat::Depth24Plus => Self::DEPTH, - wgt::TextureFormat::Depth32FloatStencil8 | wgt::TextureFormat::Depth24PlusStencil8 => { - Self::DEPTH | Self::STENCIL - } + wgt::TextureFormat::Depth32FloatStencil8 + | wgt::TextureFormat::Depth24PlusStencil8 + | wgt::TextureFormat::Depth24UnormStencil8 => Self::DEPTH | Self::STENCIL, _ => Self::COLOR, } } diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs index 2f60f50718..f59237100f 100644 --- a/wgpu-hal/src/metal/adapter.rs +++ b/wgpu-hal/src/metal/adapter.rs @@ -200,6 +200,12 @@ impl crate::Adapter for super::Adapter { } flags } + Tf::Depth24UnormStencil8 => { + Tfc::DEPTH_STENCIL_ATTACHMENT + | Tfc::SAMPLED_LINEAR + | Tfc::MULTISAMPLE + | Tfc::MULTISAMPLE_RESOLVE + } Tf::Rgb9e5Ufloat => { if pc.msaa_apple3 { all_caps @@ -762,6 +768,7 @@ impl super::PrivateCapabilities { features.set(F::TEXTURE_COMPRESSION_ETC2, self.format_eac_etc); features.set(F::DEPTH_CLIP_CONTROL, self.supports_depth_clip_control); + features.set(F::DEPTH24UNORM_STENCIL8, self.format_depth24_stencil8); features.set( F::TEXTURE_BINDING_ARRAY @@ -909,6 +916,7 @@ impl super::PrivateCapabilities { Depth32Float_Stencil8 } } + Tf::Depth24UnormStencil8 => Depth24Unorm_Stencil8, Tf::Rgb9e5Ufloat => RGB9E5Float, Tf::Bc1RgbaUnorm => BC1_RGBA, Tf::Bc1RgbaUnormSrgb => BC1_RGBA_sRGB, diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs index 4a016714cd..e2b914cb6a 100644 --- a/wgpu-hal/src/vulkan/adapter.rs +++ b/wgpu-hal/src/vulkan/adapter.rs @@ -566,9 +566,16 @@ impl PhysicalDeviceFeatures { caps.supports_format( vk::Format::D32_SFLOAT_S8_UINT, vk::ImageTiling::OPTIMAL, - vk::FormatFeatureFlags::DEPTH_STENCIL_ATTACHMENT - | vk::FormatFeatureFlags::SAMPLED_IMAGE - | vk::FormatFeatureFlags::TRANSFER_SRC, + vk::FormatFeatureFlags::DEPTH_STENCIL_ATTACHMENT, + ), + ); + + features.set( + F::DEPTH24UNORM_STENCIL8, + caps.supports_format( + vk::Format::D24_UNORM_S8_UINT, + vk::ImageTiling::OPTIMAL, + vk::FormatFeatureFlags::DEPTH_STENCIL_ATTACHMENT, ), ); diff --git a/wgpu-hal/src/vulkan/conv.rs b/wgpu-hal/src/vulkan/conv.rs index 83fc318486..df24e8e6a7 100644 --- a/wgpu-hal/src/vulkan/conv.rs +++ b/wgpu-hal/src/vulkan/conv.rs @@ -64,6 +64,7 @@ impl super::PrivateCapabilities { 
F::D32_SFLOAT_S8_UINT } } + Tf::Depth24UnormStencil8 => F::D24_UNORM_S8_UINT, Tf::Rgb9e5Ufloat => F::E5B9G9R9_UFLOAT_PACK32, Tf::Bc1RgbaUnorm => F::BC1_RGBA_UNORM_BLOCK, Tf::Bc1RgbaUnormSrgb => F::BC1_RGBA_SRGB_BLOCK, diff --git a/wgpu-types/LICENSE.APACHE b/wgpu-types/LICENSE.APACHE new file mode 120000 index 0000000000..7141cad5b2 --- /dev/null +++ b/wgpu-types/LICENSE.APACHE @@ -0,0 +1 @@ +../LICENSE.APACHE \ No newline at end of file diff --git a/wgpu-types/LICENSE.MIT b/wgpu-types/LICENSE.MIT new file mode 120000 index 0000000000..6b8772d1a7 --- /dev/null +++ b/wgpu-types/LICENSE.MIT @@ -0,0 +1 @@ +../LICENSE.MIT \ No newline at end of file diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs index 88e2a01205..d9144b8a8f 100644 --- a/wgpu-types/src/lib.rs +++ b/wgpu-types/src/lib.rs @@ -185,6 +185,15 @@ bitflags::bitflags! { /// /// This is a web and native feature. const DEPTH_CLIP_CONTROL = 1 << 0; + /// Allows for explicit creation of textures of format [`TextureFormat::Depth24UnormStencil8`] + /// + /// Supported platforms: + /// - Vulkan (some) + /// - DX12 + /// - Metal (Macs with amd GPUs) + /// + /// This is a web and native feature. + const DEPTH24UNORM_STENCIL8 = 1 << 1; /// Allows for explicit creation of textures of format [`TextureFormat::Depth32FloatStencil8`] /// /// Supported platforms: @@ -193,7 +202,7 @@ bitflags::bitflags! { /// - Metal /// /// This is a web and native feature. - const DEPTH32FLOAT_STENCIL8 = 1 << 1; + const DEPTH32FLOAT_STENCIL8 = 1 << 2; /// Enables BCn family of compressed textures. All BCn textures use 4x4 pixel blocks /// with 8 or 16 bytes per block. /// @@ -207,7 +216,7 @@ bitflags::bitflags! { /// - desktops /// /// This is a web and native feature. - const TEXTURE_COMPRESSION_BC = 1 << 2; + const TEXTURE_COMPRESSION_BC = 1 << 3; /// Enables ETC family of compressed textures. All ETC textures use 4x4 pixel blocks. /// ETC2 RGB and RGBA1 are 8 bytes per block. RTC2 RGBA8 and EAC are 16 bytes per block. /// @@ -222,7 +231,7 @@ bitflags::bitflags! { /// - Mobile (some) /// /// This is a web and native feature. - const TEXTURE_COMPRESSION_ETC2 = 1 << 3; + const TEXTURE_COMPRESSION_ETC2 = 1 << 4; /// Enables ASTC family of compressed textures. ASTC textures use pixel blocks varying from 4x4 to 12x12. /// Blocks are always 16 bytes. /// @@ -237,7 +246,7 @@ bitflags::bitflags! { /// - Mobile (some) /// /// This is a web and native feature. - const TEXTURE_COMPRESSION_ASTC_LDR = 1 << 4; + const TEXTURE_COMPRESSION_ASTC_LDR = 1 << 5; /// Allows non-zero value for the "first instance" in indirect draw calls. /// /// Supported Platforms: @@ -246,7 +255,7 @@ bitflags::bitflags! { /// - Metal /// /// This is a web and native feature. - const INDIRECT_FIRST_INSTANCE = 1 << 5; + const INDIRECT_FIRST_INSTANCE = 1 << 6; /// Enables use of Timestamp Queries. These queries tell the current gpu timestamp when /// all work before the query is finished. Call [`CommandEncoder::write_timestamp`], /// [`RenderPassEncoder::write_timestamp`], or [`ComputePassEncoder::write_timestamp`] to @@ -264,7 +273,7 @@ bitflags::bitflags! { /// - DX12 (works) /// /// This is a web and native feature. - const TIMESTAMP_QUERY = 1 << 6; + const TIMESTAMP_QUERY = 1 << 7; /// Enables use of Pipeline Statistics Queries. These queries tell the count of various operations /// performed between the start and stop call. Call [`RenderPassEncoder::begin_pipeline_statistics_query`] to start /// a query, then call [`RenderPassEncoder::end_pipeline_statistics_query`] to stop one. 
@@ -279,7 +288,7 @@ bitflags::bitflags! { /// - DX12 (works) /// /// This is a web and native feature. - const PIPELINE_STATISTICS_QUERY = 1 << 7; + const PIPELINE_STATISTICS_QUERY = 1 << 8; /// Allows shaders to acquire the FP16 ability /// /// Note: this is not supported in naga yet,only through spir-v passthrough right now. @@ -289,7 +298,7 @@ bitflags::bitflags! { /// - Metal /// /// This is a web and native feature. - const SHADER_FLOAT16 = 1 << 8; + const SHADER_FLOAT16 = 1 << 9; /// Webgpu only allows the MAP_READ and MAP_WRITE buffer usage to be matched with /// COPY_DST and COPY_SRC respectively. This removes this requirement. /// @@ -1856,6 +1865,9 @@ pub enum TextureFormat { /// Special depth/stencil format with at least 24 bit integer depth and 8 bits integer stencil. #[cfg_attr(feature = "serde", serde(rename = "depth24plus-stencil8"))] Depth24PlusStencil8, + /// Special depth/stencil format with 24 bit integer depth and 8 bits integer stencil. + #[cfg_attr(feature = "serde", serde(rename = "depth24unorm-stencil8"))] + Depth24UnormStencil8, // Packed uncompressed texture formats /// Packed unsigned float with 9 bits mantisa for each RGB component, then a common 5 bits exponent @@ -2059,6 +2071,7 @@ impl TextureFormat { let astc_hdr = Features::TEXTURE_COMPRESSION_ASTC_HDR; let norm16bit = Features::TEXTURE_FORMAT_16BIT_NORM; let d32_s8 = Features::DEPTH32FLOAT_STENCIL8; + let d24_s8 = Features::DEPTH24UNORM_STENCIL8; // Sample Types let uint = TextureSampleType::Uint; @@ -2151,6 +2164,7 @@ impl TextureFormat { Self::Depth32FloatStencil8 =>( d32_s8, depth, linear, msaa, (1, 1), 4, attachment, 2), Self::Depth24Plus => ( native, depth, linear, msaa, (1, 1), 4, attachment, 1), Self::Depth24PlusStencil8 => ( native, depth, linear, msaa, (1, 1), 4, attachment, 2), + Self::Depth24UnormStencil8 => ( d24_s8, depth, linear, msaa, (1, 1), 4, attachment, 2), // Packed uncompressed Self::Rgb9e5Ufloat => ( native, float, linear, noaa, (1, 1), 4, basic, 3), diff --git a/wgpu/Cargo.toml b/wgpu/Cargo.toml index 834eeefe87..8a9cc0c455 100644 --- a/wgpu/Cargo.toml +++ b/wgpu/Cargo.toml @@ -123,6 +123,7 @@ bitflags = "1" bytemuck = { version = "1.4", features = ["derive"] } glam = "0.20.2" ddsfile = "0.5" +futures-intrusive = "0.4" log = "0.4" # Opt out of noise's "default-features" to avoid "image" feature as a dependency count optimization. # This will not be required in the next release since it has been removed from the default feature in https://github.com/Razaekel/noise-rs/commit/1af9e1522236b2c584fb9a02150c9c67a5e6bb04#diff-2e9d962a08321605940b5a657135052fbcef87b5e360662bb527c96d9a615542 diff --git a/wgpu/LICENSE.APACHE b/wgpu/LICENSE.APACHE new file mode 120000 index 0000000000..7141cad5b2 --- /dev/null +++ b/wgpu/LICENSE.APACHE @@ -0,0 +1 @@ +../LICENSE.APACHE \ No newline at end of file diff --git a/wgpu/LICENSE.MIT b/wgpu/LICENSE.MIT new file mode 120000 index 0000000000..6b8772d1a7 --- /dev/null +++ b/wgpu/LICENSE.MIT @@ -0,0 +1 @@ +../LICENSE.MIT \ No newline at end of file diff --git a/wgpu/examples/capture/main.rs b/wgpu/examples/capture/main.rs index 52aa8fad2c..6e85a9a16a 100644 --- a/wgpu/examples/capture/main.rs +++ b/wgpu/examples/capture/main.rs @@ -135,21 +135,23 @@ async fn create_png( ) { // Note that we're not calling `.await` here. let buffer_slice = output_buffer.slice(..); - let buffer_future = buffer_slice.map_async(wgpu::MapMode::Read); + // Sets the buffer up for mapping, sending over the result of the mapping back to us when it is finished. 
+ let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel(); + buffer_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap()); // Poll the device in a blocking manner so that our future resolves. // In an actual application, `device.poll(...)` should // be called in an event loop or on another thread. // // We pass our submission index so we don't need to wait for any other possible submissions. - device.poll(wgpu::Maintain::Wait(Some(submission_index))); + device.poll(wgpu::Maintain::WaitForSubmissionIndex(submission_index)); // If a file system is available, write the buffer as a PNG let has_file_system_available = cfg!(not(target_arch = "wasm32")); if !has_file_system_available { return; } - if let Ok(()) = buffer_future.await { + if let Some(Ok(())) = receiver.receive().await { let padded_buffer = buffer_slice.get_mapped_range(); let mut png_encoder = png::Encoder::new( @@ -225,18 +227,15 @@ mod tests { #[test] fn ensure_generated_data_matches_expected() { - pollster::block_on(assert_generated_data_matches_expected()); + assert_generated_data_matches_expected(); } - async fn assert_generated_data_matches_expected() { + fn assert_generated_data_matches_expected() { let (device, output_buffer, dimensions) = create_red_image_with_dimensions(100usize, 200usize).await; let buffer_slice = output_buffer.slice(..); - let buffer_future = buffer_slice.map_async(wgpu::MapMode::Read); + buffer_slice.map_async(wgpu::MapMode::Read, |_| ()); device.poll(wgpu::Maintain::Wait); - buffer_future - .await - .expect("failed to map buffer slice for capture test"); let padded_buffer = buffer_slice.get_mapped_range(); let expected_buffer_size = dimensions.padded_bytes_per_row * dimensions.height; assert_eq!(padded_buffer.len(), expected_buffer_size); diff --git a/wgpu/examples/cube/shader.wgsl b/wgpu/examples/cube/shader.wgsl index e79bd02610..8e9fa6c495 100644 --- a/wgpu/examples/cube/shader.wgsl +++ b/wgpu/examples/cube/shader.wgsl @@ -30,6 +30,6 @@ fn fs_main(vertex: VertexOutput) -> @location(0) vec4 { } @fragment -fn fs_wire() -> @location(0) vec4 { +fn fs_wire(vertex: VertexOutput) -> @location(0) vec4 { return vec4(0.0, 0.5, 0.0, 0.5); } diff --git a/wgpu/examples/framework.rs b/wgpu/examples/framework.rs index 0ed135f1cd..63c91fd80e 100644 --- a/wgpu/examples/framework.rs +++ b/wgpu/examples/framework.rs @@ -517,7 +517,7 @@ pub fn test(mut params: FrameworkRefTest) { ctx.queue.submit(Some(cmd_buf.finish())); let dst_buffer_slice = dst_buffer.slice(..); - let _ = dst_buffer_slice.map_async(wgpu::MapMode::Read); + dst_buffer_slice.map_async(wgpu::MapMode::Read, |_| ()); ctx.device.poll(wgpu::Maintain::Wait); let bytes = dst_buffer_slice.get_mapped_range().to_vec(); diff --git a/wgpu/examples/hello-compute/main.rs b/wgpu/examples/hello-compute/main.rs index bbff9130f4..df157cbfec 100644 --- a/wgpu/examples/hello-compute/main.rs +++ b/wgpu/examples/hello-compute/main.rs @@ -147,8 +147,9 @@ async fn execute_gpu_inner( // Note that we're not calling `.await` here. let buffer_slice = staging_buffer.slice(..); - // Gets the future representing when `staging_buffer` can be read from - let buffer_future = buffer_slice.map_async(wgpu::MapMode::Read); + // Sets the buffer up for mapping, sending over the result of the mapping back to us when it is finished. 
+ let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel(); + buffer_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap()); // Poll the device in a blocking manner so that our future resolves. // In an actual application, `device.poll(...)` should @@ -156,7 +157,7 @@ async fn execute_gpu_inner( device.poll(wgpu::Maintain::Wait); // Awaits until `buffer_future` can be read from - if let Ok(()) = buffer_future.await { + if let Some(Ok(())) = receiver.receive().await { // Gets contents of buffer let data = buffer_slice.get_mapped_range(); // Since contents are got in bytes, this converts these bytes back to u32 diff --git a/wgpu/examples/mipmap/main.rs b/wgpu/examples/mipmap/main.rs index 9603895d57..cd31863b80 100644 --- a/wgpu/examples/mipmap/main.rs +++ b/wgpu/examples/mipmap/main.rs @@ -380,11 +380,11 @@ impl framework::Example for Example { queue.submit(Some(init_encoder.finish())); if let Some(ref query_sets) = query_sets { - // We can ignore the future as we're about to wait for the device. - let _ = query_sets + // We can ignore the callback as we're about to wait for the device. + query_sets .data_buffer .slice(..) - .map_async(wgpu::MapMode::Read); + .map_async(wgpu::MapMode::Read, |_| ()); // Wait for device to be done rendering mipmaps device.poll(wgpu::Maintain::Wait); // This is guaranteed to be ready. diff --git a/wgpu/examples/skybox/main.rs b/wgpu/examples/skybox/main.rs index 9dcd3ece82..2b8f6d8f96 100644 --- a/wgpu/examples/skybox/main.rs +++ b/wgpu/examples/skybox/main.rs @@ -398,7 +398,7 @@ impl framework::Example for Skybox { view: &wgpu::TextureView, device: &wgpu::Device, queue: &wgpu::Queue, - spawner: &framework::Spawner, + _spawner: &framework::Spawner, ) { let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); @@ -457,8 +457,7 @@ impl framework::Example for Skybox { queue.submit(std::iter::once(encoder.finish())); - let belt_future = self.staging_belt.recall(); - spawner.spawn_local(belt_future); + self.staging_belt.recall(); } } diff --git a/wgpu/src/backend/direct.rs b/wgpu/src/backend/direct.rs index e62ada1a00..58acf88c3b 100644 --- a/wgpu/src/backend/direct.rs +++ b/wgpu/src/backend/direct.rs @@ -1,8 +1,8 @@ use crate::{ - backend::native_gpu_future, AdapterInfo, BindGroupDescriptor, BindGroupLayoutDescriptor, - BindingResource, BufferBinding, CommandEncoderDescriptor, ComputePassDescriptor, - ComputePipelineDescriptor, DownlevelCapabilities, Features, Label, Limits, LoadOp, MapMode, - Operations, PipelineLayoutDescriptor, RenderBundleEncoderDescriptor, RenderPipelineDescriptor, + AdapterInfo, BindGroupDescriptor, BindGroupLayoutDescriptor, BindingResource, BufferBinding, + CommandEncoderDescriptor, ComputePassDescriptor, ComputePipelineDescriptor, + DownlevelCapabilities, Features, Label, Limits, LoadOp, MapMode, Operations, + PipelineLayoutDescriptor, RenderBundleEncoderDescriptor, RenderPipelineDescriptor, SamplerDescriptor, ShaderModuleDescriptor, ShaderModuleDescriptorSpirV, ShaderSource, SurfaceStatus, TextureDescriptor, TextureFormat, TextureViewDescriptor, }; @@ -806,8 +806,6 @@ impl crate::Context for Context { #[allow(clippy::type_complexity)] type RequestDeviceFuture = Ready>; - type MapAsyncFuture = native_gpu_future::GpuFuture>; - type OnSubmittedWorkDoneFuture = native_gpu_future::GpuFuture<()>; type PopErrorScopeFuture = Ready>; fn init(backends: wgt::Backends) -> Self { @@ -1622,28 +1620,20 @@ impl crate::Context for Context { buffer: &Self::BufferId, mode: 
MapMode, range: Range, - ) -> Self::MapAsyncFuture { - let (future, completion) = native_gpu_future::new_gpu_future(); - - extern "C" fn buffer_map_future_wrapper( - status: wgc::resource::BufferMapAsyncStatus, - user_data: *mut u8, - ) { - let completion = - unsafe { native_gpu_future::GpuFutureCompletion::from_raw(user_data as _) }; - completion.complete(match status { - wgc::resource::BufferMapAsyncStatus::Success => Ok(()), - _ => Err(crate::BufferAsyncError), - }) - } - + callback: impl FnOnce(Result<(), crate::BufferAsyncError>) + Send + 'static, + ) { let operation = wgc::resource::BufferMapOperation { host: match mode { MapMode::Read => wgc::device::HostMap::Read, MapMode::Write => wgc::device::HostMap::Write, }, - callback: buffer_map_future_wrapper, - user_data: completion.into_raw() as _, + callback: wgc::resource::BufferMapCallback::from_rust(Box::new(|status| { + let res = match status { + wgc::resource::BufferMapAsyncStatus::Success => Ok(()), + _ => Err(crate::BufferAsyncError), + }; + callback(res); + })), }; let global = &self.0; @@ -1651,7 +1641,6 @@ impl crate::Context for Context { Ok(()) => (), Err(cause) => self.handle_error_nolabel(&buffer.error_sink, cause, "Buffer::map_async"), } - future } fn buffer_get_mapped_range( @@ -2216,26 +2205,15 @@ impl crate::Context for Context { fn queue_on_submitted_work_done( &self, queue: &Self::QueueId, - ) -> Self::OnSubmittedWorkDoneFuture { - let (future, completion) = native_gpu_future::new_gpu_future(); - - extern "C" fn submitted_work_done_future_wrapper(user_data: *mut u8) { - let completion = - unsafe { native_gpu_future::GpuFutureCompletion::from_raw(user_data as _) }; - completion.complete(()) - } - - let closure = wgc::device::queue::SubmittedWorkDoneClosure { - callback: submitted_work_done_future_wrapper, - user_data: completion.into_raw() as _, - }; + callback: Box, + ) { + let closure = wgc::device::queue::SubmittedWorkDoneClosure::from_rust(callback); let global = &self.0; let res = wgc::gfx_select!(queue => global.queue_on_submitted_work_done(*queue, closure)); if let Err(cause) = res { self.handle_error_fatal(cause, "Queue::on_submitted_work_done"); } - future } fn device_start_capture(&self, device: &Self::DeviceId) { diff --git a/wgpu/src/backend/mod.rs b/wgpu/src/backend/mod.rs index e73d66ad89..abd090e086 100644 --- a/wgpu/src/backend/mod.rs +++ b/wgpu/src/backend/mod.rs @@ -7,6 +7,3 @@ pub(crate) use web::{BufferMappedRange, Context}; mod direct; #[cfg(any(not(target_arch = "wasm32"), feature = "webgl"))] pub(crate) use direct::{BufferMappedRange, Context}; - -#[cfg(any(not(target_arch = "wasm32"), feature = "webgl"))] -mod native_gpu_future; diff --git a/wgpu/src/backend/native_gpu_future.rs b/wgpu/src/backend/native_gpu_future.rs deleted file mode 100644 index f80fba7c37..0000000000 --- a/wgpu/src/backend/native_gpu_future.rs +++ /dev/null @@ -1,143 +0,0 @@ -//! Futures that can be resolved when the GPU completes a task. -//! -//! This module defines the [`GpuFuture`] and [`GpuFutureCompletion`] -//! types, which `wgpu` uses to communicate to users when GPU -//! operations have completed, and when resources are ready to access. -//! This is only used by the `direct` back end, not on the web. -//! -//! The life cycle of a `GpuFuture` is as follows: -//! -//! - Calling [`new_gpu_future`] constructs a paired `GpuFuture` and -//! `GpuFutureCompletion`. -//! -//! - Calling [`complete(v)`] on a `GpuFutureCompletion` marks its -//! paired `GpuFuture` as ready with value `v`. This also wakes -//! 
the most recent [`Waker`] the future was polled with, if any. -//! -//! - Polling a `GpuFuture` either returns `v` if it is ready, or -//! saves the `Waker` passed to [`Future::poll`], to be awoken -//! when `complete` is called on the paired `GpuFutureCompletion`. -//! -//! ## Communicating with `wgpu_core` -//! -//! The `wgpu_core` crate uses various specialized callback types, -//! like [`wgpu_core::resource::BufferMapOperation`] for reporting -//! buffers that are ready to map, or -//! [`wgpu_core::device::queue::SubmittedWorkDoneClosure`] for -//! reporting the completion of submitted commands. To support FFI -//! bindings, these are unsafe, low-level structures that usually have -//! a function pointer and a untyped, raw "closure" pointer. -//! -//! Calling [`GpuFutureCompletion::into_raw`] returns a raw opaque -//! pointer suitable for use as the "closure" pointer in `wgpu_core`'s -//! callbacks. The [`GpuFutureCompletion::from_raw`] converts such a -//! raw opaque pointer back into a [`GpuFutureCompletion`]. See the -//! direct back end's implementation of [`Context::buffer_map_async`] -//! for an example of this. -//! -//! [`complete(v)`]: GpuFutureCompletion::complete -//! [`Waker`]: std::task::Waker -//! [`Future::poll`]: std::future::Future::poll -//! [`wgpu_core::resource::BufferMapOperation`]: https://docs.rs/wgpu-core/latest/wgpu_core/resource/struct.BufferMapOperation.html -//! [`wgpu_core::device::queue::SubmittedWorkDoneClosure`]: https://docs.rs/wgpu-core/latest/wgpu_core/device/queue/struct.SubmittedWorkDoneClosure.html -//! [`Context::buffer_map_async`]: crate::Context::buffer_map_async -use parking_lot::Mutex; -use std::future::Future; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll, Waker}; - -/// The current state of a `GpuFuture`. -enum WakerOrResult { - /// The last [`Waker`] used to poll this future, if any. - /// - /// [`Waker`]: std::task::Waker - Waker(Waker), - - /// The value this future resolves to, if it is ready. - Result(T), -} - -/// The shared state of a [`GpuFuture`] and its [`GpuFutureCompletion`]. -/// -/// Polling the future when it is not yet ready stores the [`Waker`] -/// here; completing the future when it has not yet been polled stores -/// the value here. See [`WakerOrResult`] for details. -type GpuFutureData = Mutex>>; - -/// A [`Future`] that will be ready when some sort of GPU activity has finished. -/// -/// Call [`new_gpu_future`] to create a `GpuFuture`, along with a -/// paired `GpuFutureCompletion` that can be used to mark it as ready. -pub struct GpuFuture { - data: Arc>, -} - -/// An opaque type used for pointers to a [`GpuFutureCompletion`]'s guts. -pub enum OpaqueData {} - -//TODO: merge this with `GpuFuture` and avoid `Arc` on the data. -/// A completion handle to set the result on a [`GpuFuture`]. -pub struct GpuFutureCompletion { - data: Arc>, -} - -impl Future for GpuFuture { - type Output = T; - - fn poll(self: Pin<&mut Self>, context: &mut Context) -> Poll { - let mut waker_or_result = self.into_ref().get_ref().data.lock(); - - match waker_or_result.take() { - Some(WakerOrResult::Result(res)) => Poll::Ready(res), - _ => { - *waker_or_result = Some(WakerOrResult::Waker(context.waker().clone())); - Poll::Pending - } - } - } -} - -impl GpuFutureCompletion { - /// Mark our paired [`GpuFuture`] as ready, with the given `value`. 
- pub fn complete(self, value: T) { - let mut waker_or_result = self.data.lock(); - - match waker_or_result.replace(WakerOrResult::Result(value)) { - Some(WakerOrResult::Waker(waker)) => waker.wake(), - None => {} - Some(WakerOrResult::Result(_)) => { - // Drop before panicking. Not sure if this is necessary, but it makes me feel better. - drop(waker_or_result); - unreachable!() - } - }; - } - - /// Convert this `GpuFutureCompletion` into a raw pointer for `wgpu_core` to hold. - pub(crate) fn into_raw(self) -> *mut OpaqueData { - Arc::into_raw(self.data) as _ - } - - /// Convert a raw pointer returned by [`into_raw`] back into a `GpuFutureCompletion`. - /// - /// [`into_raw`]: GpuFutureCompletion::into_raw - pub(crate) unsafe fn from_raw(this: *mut OpaqueData) -> Self { - Self { - data: Arc::from_raw(this as _), - } - } -} - -/// Construct a fresh [`GpuFuture`] and a paired [`GpuFutureCompletion`]. -/// -/// See the module docs for details. -pub(crate) fn new_gpu_future() -> (GpuFuture, GpuFutureCompletion) { - let data = Arc::new(Mutex::new(None)); - ( - GpuFuture { - data: Arc::clone(&data), - }, - GpuFutureCompletion { data }, - ) -} diff --git a/wgpu/src/backend/web.rs b/wgpu/src/backend/web.rs index 48738f4f18..af496942c8 100644 --- a/wgpu/src/backend/web.rs +++ b/wgpu/src/backend/web.rs @@ -1,10 +1,12 @@ #![allow(clippy::type_complexity)] use std::{ + cell::RefCell, fmt, future::Future, ops::Range, pin::Pin, + rc::Rc, task::{self, Poll}, }; use wasm_bindgen::{prelude::*, JsCast}; @@ -546,6 +548,7 @@ fn map_texture_format(texture_format: wgt::TextureFormat) -> web_sys::GpuTexture TextureFormat::Depth32FloatStencil8 => tf::Depth32floatStencil8, TextureFormat::Depth24Plus => tf::Depth24plus, TextureFormat::Depth24PlusStencil8 => tf::Depth24plusStencil8, + TextureFormat::Depth24UnormStencil8 => tf::Depth24unormStencil8, _ => unimplemented!(), } } @@ -595,6 +598,7 @@ fn map_texture_format_from_web_sys( tf::Depth32floatStencil8 => TextureFormat::Depth32FloatStencil8, tf::Depth24plus => TextureFormat::Depth24Plus, tf::Depth24plusStencil8 => TextureFormat::Depth24PlusStencil8, + tf::Depth24unormStencil8 => TextureFormat::Depth24UnormStencil8, _ => unimplemented!(), } } @@ -933,10 +937,6 @@ fn future_request_device( .map_err(|_| crate::RequestDeviceError) } -fn future_map_async(result: JsFutureResult) -> Result<(), crate::BufferAsyncError> { - result.map(|_| ()).map_err(|_| crate::BufferAsyncError) -} - fn future_pop_error_scope(result: JsFutureResult) -> Option { match result { Ok(js_value) if js_value.is_object() => { @@ -1005,12 +1005,6 @@ impl crate::Context for Context { wasm_bindgen_futures::JsFuture, fn(JsFutureResult) -> Result<(Self::DeviceId, Self::QueueId), crate::RequestDeviceError>, >; - type MapAsyncFuture = MakeSendFuture< - wasm_bindgen_futures::JsFuture, - fn(JsFutureResult) -> Result<(), crate::BufferAsyncError>, - >; - type OnSubmittedWorkDoneFuture = - MakeSendFuture ()>; type PopErrorScopeFuture = MakeSendFuture Option>; @@ -1092,8 +1086,14 @@ impl crate::Context for Context { let possible_features = [ //TODO: update the name (wgt::Features::DEPTH_CLIP_CONTROL, Gfn::DepthClamping), - // TODO (_, Gfn::Depth24unormStencil8), - // TODO (_, Gfn::Depth32floatStencil8), + ( + wgt::Features::DEPTH24UNORM_STENCIL8, + Gfn::Depth24unormStencil8, + ), + ( + wgt::Features::DEPTH32FLOAT_STENCIL8, + Gfn::Depth32floatStencil8, + ), ( wgt::Features::PIPELINE_STATISTICS_QUERY, Gfn::PipelineStatisticsQuery, @@ -1736,17 +1736,30 @@ impl crate::Context for Context { buffer: &Self::BufferId, 
mode: crate::MapMode, range: Range, - ) -> Self::MapAsyncFuture { + callback: impl FnOnce(Result<(), crate::BufferAsyncError>) + Send + 'static, + ) { let map_promise = buffer.0.map_async_with_f64_and_f64( map_map_mode(mode), range.start as f64, (range.end - range.start) as f64, ); - MakeSendFuture::new( - wasm_bindgen_futures::JsFuture::from(map_promise), - future_map_async, - ) + // Both the 'success' and 'rejected' closures need access to callback, but only one + // of them will ever run. We have them both hold a reference to a `Rc>>`, + // and then take ownership of callback when invoked. + // + // We also only need Rc's because these will only ever be called on our thread. + let rc_callback = Rc::new(RefCell::new(Some(callback))); + + let rc_callback_clone = rc_callback.clone(); + let closure_success = wasm_bindgen::closure::Closure::once(move |_| { + rc_callback.borrow_mut().take().unwrap()(Ok(())) + }); + let closure_rejected = wasm_bindgen::closure::Closure::once(move |_| { + rc_callback_clone.borrow_mut().take().unwrap()(Err(crate::BufferAsyncError)) + }); + + let _ = map_promise.then2(&closure_success, &closure_rejected); } fn buffer_get_mapped_range( @@ -2216,7 +2229,8 @@ impl crate::Context for Context { fn queue_on_submitted_work_done( &self, _queue: &Self::QueueId, - ) -> Self::OnSubmittedWorkDoneFuture { + _callback: Box, + ) { unimplemented!() } diff --git a/wgpu/src/lib.rs b/wgpu/src/lib.rs index b645927cad..38ae0fab0f 100644 --- a/wgpu/src/lib.rs +++ b/wgpu/src/lib.rs @@ -192,8 +192,6 @@ trait Context: Debug + Send + Sized + Sync { type RequestAdapterFuture: Future> + Send; type RequestDeviceFuture: Future> + Send; - type MapAsyncFuture: Future> + Send; - type OnSubmittedWorkDoneFuture: Future + Send; type PopErrorScopeFuture: Future> + Send; fn init(backends: Backends) -> Self; @@ -337,7 +335,11 @@ trait Context: Debug + Send + Sized + Sync { buffer: &Self::BufferId, mode: MapMode, range: Range, - ) -> Self::MapAsyncFuture; + // Note: we keep this as an `impl` through the context because the native backend + // needs to wrap it with a wrapping closure. queue_on_submitted_work_done doesn't + // need this wrapping closure, so can be made a Box immediately. + callback: impl FnOnce(Result<(), BufferAsyncError>) + Send + 'static, + ); fn buffer_get_mapped_range( &self, buffer: &Self::BufferId, @@ -496,7 +498,12 @@ trait Context: Debug + Send + Sized + Sync { fn queue_on_submitted_work_done( &self, queue: &Self::QueueId, - ) -> Self::OnSubmittedWorkDoneFuture; + // Note: we force the caller to box this because neither backend needs to + // wrap the callback and this prevents us from needing to make more functions + // generic than we have to. `buffer_map_async` needs to be wrapped on the native + // backend, so we don't box until after it has been wrapped. + callback: Box, + ); fn device_start_capture(&self, device: &Self::DeviceId); fn device_stop_capture(&self, device: &Self::DeviceId); @@ -2379,20 +2386,20 @@ impl Buffer { } impl<'a> BufferSlice<'a> { - //TODO: fn slice(&self) -> Self - - /// Map the buffer. Buffer is ready to map once the future is resolved. + /// Map the buffer. Buffer is ready to map once the callback is called. /// - /// For the future to complete, `device.poll(...)` must be called elsewhere in the runtime, possibly integrated - /// into an event loop, run on a separate thread, or continually polled in the same task runtime that this - /// future will be run on. 
+ /// For the callback to complete, either `queue.submit(..)`, `instance.poll_all(..)`, or `device.poll(..)` + /// must be called elsewhere in the runtime, possibly integrated into an event loop or run on a separate thread. /// - /// It's expected that wgpu will eventually supply its own event loop infrastructure that will be easy to integrate - /// into other event loops, like winit's. + /// The callback will be called on the thread that first calls the above functions after the gpu work + /// has completed. There are no restrictions on the code you can run in the callback, however on native the + /// call to the function will not complete until the callback returns, so prefer keeping callbacks short + /// and used to set flags, send messages, etc. pub fn map_async( &self, mode: MapMode, - ) -> impl Future> + Send { + callback: impl FnOnce(Result<(), BufferAsyncError>) + Send + 'static, + ) { let mut mc = self.buffer.map_context.lock(); assert_eq!( mc.initial_range, @@ -2411,6 +2418,7 @@ impl<'a> BufferSlice<'a> { &self.buffer.id, mode, self.offset..end, + callback, ) } @@ -3388,10 +3396,19 @@ impl Queue { Context::queue_get_timestamp_period(&*self.context, &self.id) } - /// Returns a future that resolves once all the work submitted by this point - /// is done processing on GPU. - pub fn on_submitted_work_done(&self) -> impl Future + Send { - Context::queue_on_submitted_work_done(&*self.context, &self.id) + /// Registers a callback when the previous call to submit finishes running on the gpu. This callback + /// being called implies that all mapped buffer callbacks attached to the same submission have also + /// been called. + /// + /// For the callback to complete, either `queue.submit(..)`, `instance.poll_all(..)`, or `device.poll(..)` + /// must be called elsewhere in the runtime, possibly integrated into an event loop or run on a separate thread. + /// + /// The callback will be called on the thread that first calls the above functions after the gpu work + /// has completed. There are no restrictions on the code you can run in the callback, however on native the + /// call to the function will not complete until the callback returns, so prefer keeping callbacks short + /// and used to set flags, send messages, etc. + pub fn on_submitted_work_done(&self, callback: impl FnOnce() + Send + 'static) { + Context::queue_on_submitted_work_done(&*self.context, &self.id, Box::new(callback)) } } diff --git a/wgpu/src/util/belt.rs b/wgpu/src/util/belt.rs index efcbe7affa..e65a3a92a3 100644 --- a/wgpu/src/util/belt.rs +++ b/wgpu/src/util/belt.rs @@ -3,44 +3,10 @@ use crate::{ CommandEncoder, Device, MapMode, }; use std::fmt; -use std::pin::Pin; -use std::task::{self, Poll}; -use std::{future::Future, sync::mpsc}; - -// Given a vector of futures, poll each in parallel until all are ready. -struct Join { - futures: Vec>, -} - -impl> Future for Join { - type Output = (); - - fn poll(self: Pin<&mut Self>, cx: &mut task::Context) -> Poll { - // This is safe because we have no Drop implementation to violate the Pin requirements and - // do not provide any means of moving the inner futures. 
- let all_ready = unsafe { - // Poll all remaining futures, removing all that are ready - self.get_unchecked_mut().futures.iter_mut().all(|opt| { - if let Some(future) = opt { - if Pin::new_unchecked(future).poll(cx) == Poll::Ready(()) { - *opt = None; - } - } - - opt.is_none() - }) - }; - - if all_ready { - Poll::Ready(()) - } else { - Poll::Pending - } - } -} +use std::sync::{mpsc, Arc}; struct Chunk { - buffer: Buffer, + buffer: Arc, size: BufferAddress, offset: BufferAddress, } @@ -116,12 +82,12 @@ impl StagingBelt { } else { let size = self.chunk_size.max(size.get()); Chunk { - buffer: device.create_buffer(&BufferDescriptor { + buffer: Arc::new(device.create_buffer(&BufferDescriptor { label: Some("(wgpu internal) StagingBelt staging buffer"), size, usage: BufferUsages::MAP_WRITE | BufferUsages::COPY_SRC, mapped_at_creation: true, - }), + })), size, offset: 0, } @@ -158,31 +124,23 @@ impl StagingBelt { /// Recall all of the closed buffers back to be reused. /// /// This has to be called after the command encoders written to `write_buffer` are submitted! - pub fn recall(&mut self) -> impl Future + Send { + pub fn recall(&mut self) { while let Ok(mut chunk) = self.receiver.try_recv() { chunk.offset = 0; self.free_chunks.push(chunk); } let sender = &self.sender; - let futures = self - .closed_chunks - .drain(..) - .map(|chunk| { - let sender = sender.clone(); - let async_buffer = chunk.buffer.slice(..).map_async(MapMode::Write); - - Some(async move { - // The result is ignored - async_buffer.await.ok(); - - // The only possible error is the other side disconnecting, which is fine + for chunk in self.closed_chunks.drain(..) { + let sender = sender.clone(); + chunk + .buffer + .clone() + .slice(..) + .map_async(MapMode::Write, move |_| { let _ = sender.send(chunk); - }) - }) - .collect::>(); - - Join { futures } + }); + } } } diff --git a/wgpu/src/util/mod.rs b/wgpu/src/util/mod.rs index 0a8820a301..3ce386f07b 100644 --- a/wgpu/src/util/mod.rs +++ b/wgpu/src/util/mod.rs @@ -6,7 +6,7 @@ mod encoder; mod indirect; mod init; -use std::future::Future; +use std::sync::Arc; use std::{ borrow::Cow, mem::{align_of, size_of}, @@ -70,7 +70,7 @@ pub fn make_spirv_raw(data: &[u8]) -> Cow<[u32]> { } /// CPU accessible buffer used to download data back from the GPU. -pub struct DownloadBuffer(super::Buffer, super::BufferMappedRange); +pub struct DownloadBuffer(Arc, super::BufferMappedRange); impl DownloadBuffer { /// Asynchronously read the contents of a buffer. 
@@ -78,18 +78,19 @@ impl DownloadBuffer { device: &super::Device, queue: &super::Queue, buffer: &super::BufferSlice, - ) -> impl Future> + Send { + callback: impl FnOnce(Result) + Send + 'static, + ) { let size = match buffer.size { Some(size) => size.into(), None => buffer.buffer.map_context.lock().total_size - buffer.offset, }; - let download = device.create_buffer(&super::BufferDescriptor { + let download = Arc::new(device.create_buffer(&super::BufferDescriptor { size, usage: super::BufferUsages::COPY_DST | super::BufferUsages::MAP_READ, mapped_at_creation: false, label: None, - }); + })); let mut encoder = device.create_command_encoder(&super::CommandEncoderDescriptor { label: None }); @@ -97,13 +98,22 @@ impl DownloadBuffer { let command_buffer: super::CommandBuffer = encoder.finish(); queue.submit(Some(command_buffer)); - let fut = download.slice(..).map_async(super::MapMode::Read); - async move { - fut.await?; - let mapped_range = - super::Context::buffer_get_mapped_range(&*download.context, &download.id, 0..size); - Ok(Self(download, mapped_range)) - } + download + .clone() + .slice(..) + .map_async(super::MapMode::Read, move |result| { + if let Err(e) = result { + callback(Err(e)); + return; + } + + let mapped_range = super::Context::buffer_get_mapped_range( + &*download.context, + &download.id, + 0..size, + ); + callback(Ok(Self(download, mapped_range))); + }); } } diff --git a/wgpu/tests/clear_texture.rs b/wgpu/tests/clear_texture.rs index 09831d75e2..1b4044e055 100644 --- a/wgpu/tests/clear_texture.rs +++ b/wgpu/tests/clear_texture.rs @@ -332,6 +332,22 @@ fn clear_texture_d32_s8() { ) } +#[test] +fn clear_texture_d24_s8() { + initialize_test( + TestParameters::default() + .features(wgpu::Features::CLEAR_TEXTURE | wgpu::Features::DEPTH24UNORM_STENCIL8), + |ctx| { + clear_texture_tests( + &ctx, + &[wgpu::TextureFormat::Depth24UnormStencil8], + false, + false, + ); + }, + ) +} + #[test] fn clear_texture_2d_bc() { initialize_test( diff --git a/wgpu/tests/vertex_indices/mod.rs b/wgpu/tests/vertex_indices/mod.rs index fa85ae62d9..7c99ca1e25 100644 --- a/wgpu/tests/vertex_indices/mod.rs +++ b/wgpu/tests/vertex_indices/mod.rs @@ -123,7 +123,7 @@ fn pulling_common( ctx.queue.submit(Some(encoder.finish())); let slice = buffer.slice(..); - let _ = slice.map_async(wgpu::MapMode::Read); + slice.map_async(wgpu::MapMode::Read, |_| ()); ctx.device.poll(wgpu::Maintain::Wait); let data: Vec = bytemuck::cast_slice(&*slice.get_mapped_range()).to_vec(); diff --git a/wgpu/tests/zero_init_texture_after_discard.rs b/wgpu/tests/zero_init_texture_after_discard.rs index ec77e909f0..2a7bd8786a 100644 --- a/wgpu/tests/zero_init_texture_after_discard.rs +++ b/wgpu/tests/zero_init_texture_after_discard.rs @@ -282,7 +282,7 @@ fn copy_texture_to_buffer( fn assert_buffer_is_zero(readback_buffer: &wgpu::Buffer, device: &wgpu::Device) { { let buffer_slice = readback_buffer.slice(..); - let _ = buffer_slice.map_async(wgpu::MapMode::Read); + buffer_slice.map_async(wgpu::MapMode::Read, |_| ()); device.poll(wgpu::Maintain::Wait); let buffer_view = buffer_slice.get_mapped_range();
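Taken together, the user-facing result of this patch is that `Buffer::map_async` no longer returns a future; it takes a callback that is invoked once the mapping result is known. Below is a minimal sketch of reading a staging buffer back with the new API, using the same `futures_intrusive` oneshot-channel pattern the updated `capture` and `hello-compute` examples adopt. The helper name and the `bytemuck` cast are illustrative assumptions, not part of the patch.

```rust
// Illustrative helper, not part of this patch: reads a MAP_READ staging buffer as u32s.
async fn read_staging_buffer(device: &wgpu::Device, staging_buffer: &wgpu::Buffer) -> Vec<u32> {
    let buffer_slice = staging_buffer.slice(..);

    // map_async now takes a callback that fires once the mapping result is known.
    // A oneshot channel turns that back into something awaitable.
    let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel();
    buffer_slice.map_async(wgpu::MapMode::Read, move |result| {
        sender.send(result).unwrap();
    });

    // The callback only runs once the device is polled (or more work is submitted),
    // so block here until the queue is drained.
    device.poll(wgpu::Maintain::Wait);

    match receiver.receive().await {
        Some(Ok(())) => {
            let data = buffer_slice.get_mapped_range();
            let result = bytemuck::cast_slice(&*data).to_vec();
            drop(data);
            staging_buffer.unmap();
            result
        }
        _ => panic!("failed to map staging buffer"),
    }
}
```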
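Similarly, `Queue::on_submitted_work_done` now takes an `FnOnce() + Send + 'static` callback instead of returning a future. A short sketch of observing it with a standard-library mpsc channel follows; the channel is just one way to propagate the signal and is an assumption of this example.

```rust
// Illustrative only: blocks until all previously submitted GPU work has finished.
fn wait_for_submitted_work(device: &wgpu::Device, queue: &wgpu::Queue) {
    let (sender, receiver) = std::sync::mpsc::channel();
    queue.on_submitted_work_done(move || {
        // Called from whichever thread polls the device once the submission completes,
        // so keep the body short: just signal the waiting thread.
        let _ = sender.send(());
    });

    // Without a poll (or another submit) the callback would never fire.
    device.poll(wgpu::Maintain::Wait);
    receiver.recv().expect("work-done callback was dropped");
}
```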