From abeb86360e731295498286000ebe2f5807219620 Mon Sep 17 00:00:00 2001
From: teoxoy <28601907+teoxoy@users.noreply.github.com>
Date: Fri, 17 May 2024 17:36:17 +0200
Subject: [PATCH] ensure safety of indirect dispatch by injecting a compute
 shader that validates the content of the indirect buffer

also adds missing indirect buffer offset validation

---
 deno_webgpu/binding.rs                      |   2 +
 deno_webgpu/shader.rs                       |   1 +
 tests/tests/dispatch_workgroups_indirect.rs | 197 +++++++++
 tests/tests/root.rs                         |   1 +
 wgpu-core/Cargo.toml                        |   4 +
 wgpu-core/src/binding_model.rs              |   7 +
 wgpu-core/src/command/bind.rs               |  14 +-
 wgpu-core/src/command/compute.rs            |  31 ++
 wgpu-core/src/command/compute_command.rs    |  16 +
 wgpu-core/src/command/mod.rs                |  13 +-
 wgpu-core/src/command/render.rs             |  16 +
 wgpu-core/src/device/global.rs              |  47 ++-
 wgpu-core/src/device/resource.rs            |  51 ++-
 wgpu-core/src/indirect_validation.rs        | 423 ++++++++++++++++++++
 wgpu-core/src/instance.rs                   |  34 +-
 wgpu-core/src/lib.rs                        |   2 +
 wgpu-core/src/pipeline.rs                   |   8 +-
 wgpu-core/src/resource.rs                   |   4 +
 wgpu/Cargo.toml                             |   6 +
 wgpu/src/backend/wgpu_core.rs               |   5 +
 20 files changed, 861 insertions(+), 21 deletions(-)
 create mode 100644 tests/tests/dispatch_workgroups_indirect.rs
 create mode 100644 wgpu-core/src/indirect_validation.rs

diff --git a/deno_webgpu/binding.rs b/deno_webgpu/binding.rs
index 0efeb6716a9..e5f6d6c613a 100644
--- a/deno_webgpu/binding.rs
+++ b/deno_webgpu/binding.rs
@@ -224,6 +224,7 @@ pub fn op_webgpu_create_pipeline_layout(
         label: Some(label),
         bind_group_layouts: Cow::from(bind_group_layouts),
         push_constant_ranges: Default::default(),
+        ignore_push_constant_check: false,
     };
 
     gfx_put!(device => instance.device_create_pipeline_layout(
@@ -288,6 +289,7 @@ pub fn op_webgpu_create_bind_group(
                         buffer_id: buffer_resource.1,
                         offset: entry.offset.unwrap_or(0),
                         size: std::num::NonZeroU64::new(entry.size.unwrap_or(0)),
+                        allow_indirect_as_storage: false,
                     },
                 )
             }
diff --git a/deno_webgpu/shader.rs b/deno_webgpu/shader.rs
index 17cde43936d..2a5abfa9b62 100644
--- a/deno_webgpu/shader.rs
+++ b/deno_webgpu/shader.rs
@@ -43,6 +43,7 @@ pub fn op_webgpu_create_shader_module(
     let descriptor = wgpu_core::pipeline::ShaderModuleDescriptor {
         label: Some(label),
         shader_bound_checks: wgpu_types::ShaderBoundChecks::default(),
+        ignore_push_constant_check: false,
     };
 
     gfx_put!(device => instance.device_create_shader_module(
diff --git a/tests/tests/dispatch_workgroups_indirect.rs b/tests/tests/dispatch_workgroups_indirect.rs
new file mode 100644
index 00000000000..f08f39b15aa
--- /dev/null
+++ b/tests/tests/dispatch_workgroups_indirect.rs
@@ -0,0 +1,197 @@
+use wgpu_test::{gpu_test, FailureCase, GpuTestConfiguration, TestParameters, TestingContext};
+
+/// Make sure that the num_workgroups builtin works properly (it requires a workaround on D3D12).
+#[gpu_test]
+static NUM_WORKGROUPS_BUILTIN: GpuTestConfiguration = GpuTestConfiguration::new()
+    .parameters(
+        TestParameters::default()
+            .downlevel_flags(
+                wgpu::DownlevelFlags::COMPUTE_SHADERS | wgpu::DownlevelFlags::INDIRECT_EXECUTION,
+            )
+            .limits(wgpu::Limits::downlevel_defaults())
+            .expect_fail(FailureCase::backend(wgt::Backends::DX12)),
+    )
+    .run_async(|ctx| async move {
+        let num_workgroups = [1, 2, 3];
+        let res = run_test(&ctx, &num_workgroups, false).await;
+        assert_eq!(res, num_workgroups);
+    });
+
+/// Make sure that we discard (don't run) the dispatch if its size exceeds the device limit.
+#[gpu_test]
+static DISCARD_DISPATCH: GpuTestConfiguration = GpuTestConfiguration::new()
+    .parameters(
+        TestParameters::default()
+            .downlevel_flags(
+                wgpu::DownlevelFlags::COMPUTE_SHADERS | wgpu::DownlevelFlags::INDIRECT_EXECUTION,
+            )
+            .limits(wgpu::Limits {
+                max_compute_workgroups_per_dimension: 10,
+                ..wgpu::Limits::downlevel_defaults()
+            }),
+    )
+    .run_async(|ctx| async move {
+        let max = ctx.device.limits().max_compute_workgroups_per_dimension;
+
+        let res = run_test(&ctx, &[max, max, max], false).await;
+        assert_eq!(res, [max; 3]);
+
+        let res = run_test(&ctx, &[max + 1, 1, 1], false).await;
+        assert_eq!(res, [0; 3]);
+
+        let res = run_test(&ctx, &[1, max + 1, 1], false).await;
+        assert_eq!(res, [0; 3]);
+
+        let res = run_test(&ctx, &[1, 1, max + 1], false).await;
+        assert_eq!(res, [0; 3]);
+    });
+
+/// Make sure that unsetting the bind group set by the validation code works properly.
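+/// (The injected validation pass sets internal bind groups at indices 0 and 1; slots the
+/// user left empty must be unset again afterwards, so this dispatch is expected to fail
+/// with a "missing bind group" validation error rather than see the internal bind group.)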
+#[gpu_test]
+static UNSET_INTERNAL_BIND_GROUP: GpuTestConfiguration = GpuTestConfiguration::new()
+    .parameters(
+        TestParameters::default()
+            .downlevel_flags(
+                wgpu::DownlevelFlags::COMPUTE_SHADERS | wgpu::DownlevelFlags::INDIRECT_EXECUTION,
+            )
+            .limits(wgpu::Limits::downlevel_defaults()),
+    )
+    .run_async(|ctx| async move {
+        ctx.device.push_error_scope(wgpu::ErrorFilter::Validation);
+
+        let _ = run_test(&ctx, &[0, 0, 0], true).await;
+
+        let error = pollster::block_on(ctx.device.pop_error_scope());
+        assert!(error.map_or(false, |error| format!("{error}")
+            .contains("Expected bind group is missing")));
+    });
+
+async fn run_test(
+    ctx: &TestingContext,
+    num_workgroups: &[u32; 3],
+    forget_to_set_bind_group: bool,
+) -> [u32; 3] {
+    const SHADER_SRC: &str = "
+        @group(0) @binding(0)
+        var<storage, read_write> out: array<u32, 3>;
+
+        @compute @workgroup_size(1)
+        fn main(@builtin(num_workgroups) num_workgroups: vec3<u32>, @builtin(workgroup_id) workgroup_id: vec3<u32>) {
+            if (all(workgroup_id == vec3<u32>())) {
+                out[0] = num_workgroups.x;
+                out[1] = num_workgroups.y;
+                out[2] = num_workgroups.z;
+            }
+        }
+    ";
+
+    let module = ctx
+        .device
+        .create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: None,
+            source: wgpu::ShaderSource::Wgsl(SHADER_SRC.into()),
+        });
+
+    let pipeline = ctx
+        .device
+        .create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+            label: None,
+            layout: None,
+            module: &module,
+            entry_point: "main",
+            compilation_options: Default::default(),
+            cache: None,
+        });
+
+    let out_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+        label: None,
+        size: 12,
+        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
+        mapped_at_creation: false,
+    });
+
+    let readback_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+        label: None,
+        size: 12,
+        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
+        mapped_at_creation: false,
+    });
+
+    let bind_group = ctx.device.create_bind_group(&wgpu::BindGroupDescriptor {
+        label: None,
+        layout: &pipeline.get_bind_group_layout(0),
+        entries: &[wgpu::BindGroupEntry {
+            binding: 0,
+            resource: out_buffer.as_entire_binding(),
+        }],
+    });
+
+    let mut res = None;
+
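+    // Each case pairs an indirect offset with a buffer size so that both sizing
+    // strategies of the internal src buffer binding get exercised (with downlevel
+    // default limits, min_storage_buffer_offset_alignment is 256).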
+    for (indirect_offset, indirect_buffer_size) in [
+        // internal src buffer binding size will be buffer.size
+        (0, 12),
+        (4, 4 + 12),
+        (4, 8 + 12),
+        (256 * 2 - 4 - 12, 256 * 2 - 4),
+        // internal src buffer binding size will be 256 * 2 + x
+        (0, 256 * 2 * 2 + 4),
+        (256, 256 * 2 * 2 + 8),
+        (256 + 4, 256 * 2 * 2 + 12),
+        (256 * 2 + 16, 256 * 2 * 2 + 16),
+        (256 * 2 * 2, 256 * 2 * 2 + 32),
+        (256 + 12, 256 * 2 * 2 + 64),
+    ] {
+        let indirect_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+            label: None,
+            size: indirect_buffer_size,
+            usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::INDIRECT,
+            mapped_at_creation: false,
+        });
+
+        ctx.queue.write_buffer(
+            &indirect_buffer,
+            indirect_offset,
+            bytemuck::bytes_of(num_workgroups),
+        );
+
+        let mut encoder = ctx
+            .device
+            .create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
+        {
+            let mut compute_pass =
+                encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default());
+            compute_pass.set_pipeline(&pipeline);
+            if !forget_to_set_bind_group {
+                compute_pass.set_bind_group(0, &bind_group, &[]);
+            }
+            compute_pass.dispatch_workgroups_indirect(&indirect_buffer, indirect_offset);
+        }
+
+        encoder.copy_buffer_to_buffer(&out_buffer, 0, &readback_buffer, 0, 12);
+
+        ctx.queue.submit(Some(encoder.finish()));
+
+        readback_buffer
+            .slice(..)
+            .map_async(wgpu::MapMode::Read, |_| {});
+
+        ctx.async_poll(wgpu::Maintain::wait())
+            .await
+            .panic_on_timeout();
+
+        let view = readback_buffer.slice(..).get_mapped_range();
+
+        let current_res = *bytemuck::from_bytes(&view);
+        drop(view);
+        readback_buffer.unmap();
+
+        if let Some(past_res) = res {
+            assert_eq!(past_res, current_res);
+        } else {
+            res = Some(current_res);
+        }
+    }
+
+    res.unwrap()
+}
diff --git a/tests/tests/root.rs b/tests/tests/root.rs
index 1cb5b56c7c0..d5f55132699 100644
--- a/tests/tests/root.rs
+++ b/tests/tests/root.rs
@@ -14,6 +14,7 @@ mod clear_texture;
 mod compute_pass_ownership;
 mod create_surface_error;
 mod device;
+mod dispatch_workgroups_indirect;
 mod encoder;
 mod external_texture;
 mod float32_filterable;
diff --git a/wgpu-core/Cargo.toml b/wgpu-core/Cargo.toml
index f8c28b8793f..24da632a8dd 100644
--- a/wgpu-core/Cargo.toml
+++ b/wgpu-core/Cargo.toml
@@ -46,6 +46,10 @@ renderdoc = ["hal/renderdoc"]
 ## to the validation carried out at public APIs in all builds.
 strict_asserts = ["wgt/strict_asserts"]
 
+## Validates indirect draw/dispatch calls. This will also enable naga's
+## WGSL frontend since we use a WGSL compute shader to do the validation.
+indirect-validation = ["naga/wgsl-in"]
+
 ## Enables serialization via `serde` on common wgpu types.
 serde = ["dep:serde", "wgt/serde", "arrayvec/serde"]
 
diff --git a/wgpu-core/src/binding_model.rs b/wgpu-core/src/binding_model.rs
index 732c152dcfd..36ecfd5bf5f 100644
--- a/wgpu-core/src/binding_model.rs
+++ b/wgpu-core/src/binding_model.rs
@@ -617,6 +617,10 @@ pub struct PipelineLayoutDescriptor<'a> {
     /// [`Features::PUSH_CONSTANTS`](wgt::Features::PUSH_CONSTANTS) feature must
     /// be enabled.
     pub push_constant_ranges: Cow<'a, [wgt::PushConstantRange]>,
+    /// This is an internal flag used by indirect validation.
+    /// It allows usage of push constants without having the
+    /// [`Features::PUSH_CONSTANTS`](wgt::Features::PUSH_CONSTANTS) feature enabled.
+    pub ignore_push_constant_check: bool,
 }
 
 #[derive(Debug)]
@@ -758,6 +762,9 @@ pub struct BufferBinding {
     pub buffer_id: BufferId,
     pub offset: wgt::BufferAddress,
     pub size: Option<BufferSize>,
+    /// This is an internal flag used by indirect validation.
+    /// It allows indirect buffers to be bound as storage buffers.
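+    /// The validation pass binds the caller's `INDIRECT` buffer as a read-only storage
+    /// buffer so that the injected compute shader can inspect its contents.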
+    pub allow_indirect_as_storage: bool,
 }
 
 // Note: Duplicated in `wgpu-rs` as `BindingResource`
diff --git a/wgpu-core/src/command/bind.rs b/wgpu-core/src/command/bind.rs
index c643611a967..75d566c5963 100644
--- a/wgpu-core/src/command/bind.rs
+++ b/wgpu-core/src/command/bind.rs
@@ -131,7 +131,7 @@ mod compat {
                         diff.push(format!("Expected {expected_bgl_type} bind group layout, got {assigned_bgl_type}"))
                     }
                 } else {
-                    diff.push("Assigned bind group layout not found (internal error)".to_owned());
+                    diff.push("Expected bind group is missing".to_owned());
                 }
             } else {
                 diff.push("Expected bind group layout not found (internal error)".to_owned());
@@ -191,6 +191,10 @@ mod compat {
             self.make_range(index)
         }
 
+        pub fn unassign(&mut self, index: usize) {
+            self.entries[index].assigned = None;
+        }
+
         pub fn list_active(&self) -> impl Iterator<Item = usize> + '_ {
             self.entries
                 .iter()
@@ -358,6 +362,14 @@ impl<A: HalApi> Binder<A> {
         &self.payloads[bind_range]
     }
 
+    pub(super) fn unassign_group(&mut self, index: usize) {
+        log::trace!("\tBinding [{}] = null", index);
+
+        self.payloads[index].reset();
+
+        self.manager.unassign(index);
+    }
+
     pub(super) fn list_active<'a>(&'a self) -> impl Iterator<Item = &'a Arc<BindGroup<A>>> + '_ {
         let payloads = &self.payloads;
         self.manager
diff --git a/wgpu-core/src/command/compute.rs b/wgpu-core/src/command/compute.rs
index acbff0a0304..24cd716a007 100644
--- a/wgpu-core/src/command/compute.rs
+++ b/wgpu-core/src/command/compute.rs
@@ -182,6 +182,8 @@ pub enum ComputePassErrorInner {
     InvalidQuerySet(id::QuerySetId),
     #[error("Indirect buffer {0:?} is invalid or destroyed")]
     InvalidIndirectBuffer(id::BufferId),
+    #[error("Indirect buffer offset {0:?} is not a multiple of 4")]
+    UnalignedIndirectBufferOffset(BufferAddress),
     #[error("Indirect buffer uses bytes {offset}..{end_offset} which overruns indirect buffer of size {buffer_size}")]
     IndirectBufferOverrun {
         offset: u64,
@@ -473,6 +475,16 @@ impl Global {
                 .map_pass_err(pass_scope);
         }
 
+        #[cfg(feature = "indirect-validation")]
+        let mut base = base;
+        #[cfg(feature = "indirect-validation")]
+        device
+            .indirect_validation
+            .get()
+            .unwrap()
+            .inject_dispatch_indirect_validation(device, &mut base)
+            .map_pass_err(pass_scope)?;
+
         let mut cmd_buf_data = cmd_buf.data.lock();
         let cmd_buf_data = cmd_buf_data.as_mut().unwrap();
 
@@ -654,6 +666,20 @@
                         }
                     }
                 }
+                ArcComputeCommand::UnsetBindGroup { index } => {
+                    let scope = PassErrorScope::UnsetBindGroup(index);
+
+                    let max_bind_groups = cmd_buf.limits.max_bind_groups;
+                    if index >= max_bind_groups {
+                        return Err(ComputePassErrorInner::BindGroupIndexOutOfRange {
+                            index,
+                            max: max_bind_groups,
+                        })
+                        .map_pass_err(scope);
+                    }
+
+                    state.binder.unassign_group(index as usize);
+                }
                 ArcComputeCommand::SetPipeline(pipeline) => {
                     let pipeline_id = pipeline.as_info().id();
                     let scope = PassErrorScope::SetPipelineCompute(pipeline_id);
@@ -811,6 +837,11 @@
                     check_buffer_usage(buffer_id, buffer.usage, wgt::BufferUsages::INDIRECT)
                         .map_pass_err(scope)?;
 
+                    if offset % 4 != 0 {
+                        return Err(ComputePassErrorInner::UnalignedIndirectBufferOffset(offset))
+                            .map_pass_err(scope);
+                    }
+
                     let end_offset = offset + mem::size_of::<wgt::DispatchIndirectArgs>() as u64;
                     if end_offset > buffer.size {
                         return Err(ComputePassErrorInner::IndirectBufferOverrun {
diff --git a/wgpu-core/src/command/compute_command.rs b/wgpu-core/src/command/compute_command.rs
index 49fdbbec24d..fd3b531628e 100644
--- a/wgpu-core/src/command/compute_command.rs
+++ b/wgpu-core/src/command/compute_command.rs
@@ -19,6 +19,10 @@ pub enum ComputeCommand {
         bind_group_id: id::BindGroupId,
     },
 
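+    /// Unset a bind group. Emitted only by the internal indirect-validation pass to
+    /// clear slots it used but that the user left unbound.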
+    UnsetBindGroup {
+        index: u32,
+    },
+
     SetPipeline(id::ComputePipelineId),
 
     /// Set a range of push constants to values stored in `push_constant_data`.
@@ -103,6 +107,10 @@ impl ComputeCommand {
                     })?,
                 },
 
+                ComputeCommand::UnsetBindGroup { index } => {
+                    ArcComputeCommand::UnsetBindGroup { index }
+                }
+
                 ComputeCommand::SetPipeline(pipeline_id) => ArcComputeCommand::SetPipeline(
                     pipelines_guard
                         .get_owned(pipeline_id)
@@ -194,6 +202,10 @@ pub enum ArcComputeCommand<A: HalApi> {
         bind_group: Arc<BindGroup<A>>,
     },
 
+    UnsetBindGroup {
+        index: u32,
+    },
+
     SetPipeline(Arc<ComputePipeline<A>>),
 
     /// Set a range of push constants to values stored in `push_constant_data`.
@@ -261,6 +273,10 @@ impl<A: HalApi> From<&ArcComputeCommand<A>> for ComputeCommand {
                 bind_group_id: bind_group.as_info().id(),
             },
 
+            ArcComputeCommand::UnsetBindGroup { index } => {
+                ComputeCommand::UnsetBindGroup { index: *index }
+            }
+
             ArcComputeCommand::SetPipeline(pipeline) => {
                 ComputeCommand::SetPipeline(pipeline.as_info().id())
             }
diff --git a/wgpu-core/src/command/mod.rs b/wgpu-core/src/command/mod.rs
index 874e207a278..9c2e5d35d7f 100644
--- a/wgpu-core/src/command/mod.rs
+++ b/wgpu-core/src/command/mod.rs
@@ -15,8 +15,15 @@ use std::sync::Arc;
 pub(crate) use self::clear::clear_texture;
 pub use self::{
-    bundle::*, clear::ClearError, compute::*, compute_command::ComputeCommand, draw::*,
-    dyn_compute_pass::DynComputePass, query::*, render::*, transfer::*,
+    bundle::*,
+    clear::ClearError,
+    compute::*,
+    compute_command::{ArcComputeCommand, ComputeCommand},
+    draw::*,
+    dyn_compute_pass::DynComputePass,
+    query::*,
+    render::*,
+    transfer::*,
 };
 pub(crate) use allocator::CommandAllocator;
 
@@ -892,6 +899,8 @@ pub enum PassErrorScope {
     Pass(Option<id::CommandBufferId>),
     #[error("In a set_bind_group command")]
     SetBindGroup(id::BindGroupId),
+    #[error("In an unset_bind_group command, slot: {0}")]
+    UnsetBindGroup(u32),
     #[error("In a set_pipeline command")]
     SetPipelineRender(id::RenderPipelineId),
     #[error("In a set_pipeline command")]
diff --git a/wgpu-core/src/command/render.rs b/wgpu-core/src/command/render.rs
index defd6a608ba..71b1beeb0e8 100644
--- a/wgpu-core/src/command/render.rs
+++ b/wgpu-core/src/command/render.rs
@@ -616,6 +616,8 @@ pub enum RenderPassErrorInner {
     MissingFeatures(#[from] MissingFeatures),
     #[error(transparent)]
     MissingDownlevelFlags(#[from] MissingDownlevelFlags),
+    #[error("Indirect buffer offset {0:?} is not a multiple of 4")]
+    UnalignedIndirectBufferOffset(BufferAddress),
     #[error("Indirect draw uses bytes {offset}..{end_offset} {} which overruns indirect buffer of size {buffer_size}",
         count.map_or_else(String::new, |v| format!("(using count {v})")))]
     IndirectBufferOverrun {
@@ -2050,6 +2052,13 @@
 
                     let actual_count = count.map_or(1, |c| c.get());
 
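+                    // The indirect args are tightly packed u32s, so the indirect
+                    // offset must be a multiple of 4.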
+                    if offset % 4 != 0 {
+                        return Err(RenderPassErrorInner::UnalignedIndirectBufferOffset(
+                            offset,
+                        ))
+                        .map_pass_err(scope);
+                    }
+
                     let end_offset = offset + stride as u64 * actual_count as u64;
                     if end_offset > indirect_buffer.size {
                         return Err(RenderPassErrorInner::IndirectBufferOverrun {
@@ -2141,6 +2150,13 @@
                         .ok_or(RenderCommandError::DestroyedBuffer(count_buffer_id))
                         .map_pass_err(scope)?;
 
+                    if offset % 4 != 0 {
+                        return Err(RenderPassErrorInner::UnalignedIndirectBufferOffset(
+                            offset,
+                        ))
+                        .map_pass_err(scope);
+                    }
+
                     let end_offset = offset + stride * max_count as u64;
                     if end_offset > indirect_buffer.size {
                         return Err(RenderPassErrorInner::IndirectBufferOverrun {
diff --git a/wgpu-core/src/device/global.rs b/wgpu-core/src/device/global.rs
index a5c51b269f7..5c460e3f817 100644
--- a/wgpu-core/src/device/global.rs
+++ b/wgpu-core/src/device/global.rs
@@ -262,6 +262,51 @@ impl Global {
         let (id, resource) = fid.assign(Arc::new(buffer));
         api_log!("Device::create_buffer({desc:?}) -> {id:?}");
 
+        #[cfg(feature = "indirect-validation")]
+        if desc.usage.contains(wgt::BufferUsages::INDIRECT) {
+            // We create an indirect buffer in IndirectValidation's constructor, at which
+            // point device.indirect_validation won't yet be set.
+            if let Some(indirect_validation) = device.indirect_validation.get() {
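+                // Eagerly create the bind group used by the validation pass: it binds a
+                // window of the buffer large enough to reach any 4-byte-aligned offset
+                // in it (see `calculate_src_buffer_binding_size` for the math).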
#[cfg(feature = "indirect-validation")] + indirect_validation_bind_group: OnceCell::new(), } } @@ -1416,7 +1430,7 @@ impl Device { source: pipeline::ShaderModuleSource<'a>, ) -> Result, pipeline::CreateShaderModuleError> { let (module, source) = match source { - #[cfg(feature = "wgsl")] + #[cfg(any(feature = "wgsl", feature = "indirect-validation"))] pipeline::ShaderModuleSource::Wgsl(code) => { profiling::scope!("naga::front::wgsl::parse_str"); let module = naga::front::wgsl::parse_str(&code).map_err(|inner| { @@ -1486,7 +1500,12 @@ impl Device { }; let info = create_validator( - self.features, + self.features + | if desc.ignore_push_constant_check { + wgt::Features::PUSH_CONSTANTS + } else { + wgt::Features::empty() + }, self.downlevel.flags, naga::valid::ValidationFlags::all(), ) @@ -1866,7 +1885,7 @@ impl Device { } }; - let (pub_usage, internal_use, range_limit) = match binding_ty { + let (mut pub_usage, internal_use, range_limit) = match binding_ty { wgt::BufferBindingType::Uniform => ( wgt::BufferUsages::UNIFORM, hal::BufferUses::UNIFORM, @@ -1902,7 +1921,14 @@ impl Device { return Err(DeviceError::WrongDevice.into()); } + // Allow indirect buffers to be bound as storage buffers so that + // we can validate their content. Note that we already pass this + // usage to hal at buffer creation. + if buffer.usage.contains(wgt::BufferUsages::INDIRECT) && bb.allow_indirect_as_storage { + pub_usage = pub_usage.difference(wgt::BufferUsages::STORAGE); + } check_buffer_usage(bb.buffer_id, buffer.usage, pub_usage)?; + let raw_buffer = buffer .raw .get(snatch_guard) @@ -2485,7 +2511,7 @@ impl Device { }); } - if !desc.push_constant_ranges.is_empty() { + if !desc.ignore_push_constant_check && !desc.push_constant_ranges.is_empty() { self.require_features(wgt::Features::PUSH_CONSTANTS)?; } @@ -2500,13 +2526,15 @@ impl Device { } used_stages |= pc.stages; - let device_max_pc_size = self.limits.max_push_constant_size; - if device_max_pc_size < pc.range.end { - return Err(Error::PushConstantRangeTooLarge { - index, - range: pc.range.clone(), - max: device_max_pc_size, - }); + if !desc.ignore_push_constant_check { + let device_max_pc_size = self.limits.max_push_constant_size; + if device_max_pc_size < pc.range.end { + return Err(Error::PushConstantRangeTooLarge { + index, + range: pc.range.clone(), + max: device_max_pc_size, + }); + } } if pc.range.start % wgt::PUSH_CONSTANT_ALIGNMENT != 0 { @@ -2617,6 +2645,7 @@ impl Device { label: None, bind_group_layouts: Cow::Borrowed(&ids.group_ids[..group_count]), push_constant_ranges: Cow::Borrowed(&[]), //TODO? 
+        let src = format!("
+            @group(0) @binding(0)
+            var<storage, read_write> dst: array<u32, 3>;
+            @group(1) @binding(0)
+            var<storage, read> src: array<u32>;
+            struct OffsetPc {{
+                inner: u32,
+            }}
+            var<push_constant> offset: OffsetPc;
+
+            @compute @workgroup_size(1)
+            fn main() {{
+                let src = vec3(src[offset.inner], src[offset.inner + 1], src[offset.inner + 2]);
+                let res = select(src, vec3<u32>(), src > vec3({max_compute_workgroups_per_dimension}u));
+                dst[0] = res.x;
+                dst[1] = res.y;
+                dst[2] = res.z;
+            }}
+        ");
+
+        let device_id = device.info.id();
+
+        let (module, error) = global.device_create_shader_module::<A>(
+            device_id,
+            &crate::pipeline::ShaderModuleDescriptor {
+                label: None,
+                shader_bound_checks: wgt::ShaderBoundChecks::default(),
+                ignore_push_constant_check: true,
+            },
+            crate::pipeline::ShaderModuleSource::Wgsl(std::borrow::Cow::Owned(src)),
+            None,
+        );
+        if let Some(error) = error {
+            return Err(error.into());
+        }
+
+        let (dst_bind_group_layout, error) = global.device_create_bind_group_layout::<A>(
+            device_id,
+            &crate::binding_model::BindGroupLayoutDescriptor {
+                label: None,
+                entries: std::borrow::Cow::Borrowed(&[wgt::BindGroupLayoutEntry {
+                    binding: 0,
+                    visibility: wgt::ShaderStages::COMPUTE,
+                    ty: wgt::BindingType::Buffer {
+                        ty: wgt::BufferBindingType::Storage { read_only: false },
+                        has_dynamic_offset: false,
+                        min_binding_size: Some(std::num::NonZeroU64::new(4 * 3).unwrap()),
+                    },
+                    count: None,
+                }]),
+            },
+            None,
+        );
+        if let Some(error) = error {
+            return Err(error.into());
+        }
+
+        let (src_bind_group_layout, error) = global.device_create_bind_group_layout::<A>(
+            device_id,
+            &crate::binding_model::BindGroupLayoutDescriptor {
+                label: None,
+                entries: std::borrow::Cow::Borrowed(&[wgt::BindGroupLayoutEntry {
+                    binding: 0,
+                    visibility: wgt::ShaderStages::COMPUTE,
+                    ty: wgt::BindingType::Buffer {
+                        ty: wgt::BufferBindingType::Storage { read_only: true },
+                        has_dynamic_offset: true,
+                        min_binding_size: Some(std::num::NonZeroU64::new(4 * 3).unwrap()),
+                    },
+                    count: None,
+                }]),
+            },
+            None,
+        );
+        if let Some(error) = error {
+            return Err(error.into());
+        }
+
+        let (layout, error) = global.device_create_pipeline_layout::<A>(
+            device_id,
+            &crate::binding_model::PipelineLayoutDescriptor {
+                label: None,
+                bind_group_layouts: std::borrow::Cow::Borrowed(&[
+                    dst_bind_group_layout,
+                    src_bind_group_layout,
+                ]),
+                push_constant_ranges: std::borrow::Cow::Borrowed(&[PushConstantRange {
+                    stages: ShaderStages::COMPUTE,
+                    range: 0..4,
+                }]),
+                ignore_push_constant_check: true,
+            },
+            None,
+        );
+        if let Some(error) = error {
+            return Err(error.into());
+        }
+
+        let (pipeline, error) = global.device_create_compute_pipeline::<A>(
+            device_id,
+            &crate::pipeline::ComputePipelineDescriptor {
+                label: None,
+                layout: Some(layout),
+                stage: crate::pipeline::ProgrammableStageDescriptor {
+                    module,
+                    entry_point: Some(std::borrow::Cow::Borrowed("main")),
+                    constants: Default::default(),
+                    zero_initialize_workgroup_memory: true,
+                    vertex_pulling_transform: true,
+                },
+                cache: None,
+            },
+            None,
+            None,
+        );
+        if let Some(error) = error {
+            return Err(error.into());
+        }
+
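+        // `dst_buffer` holds the validated dispatch parameters: the injected pass
+        // writes to it as a storage buffer, and the user's indirect dispatch is then
+        // redirected to read from it (hence INDIRECT | STORAGE).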
+        let (dst_buffer_id, error) = global.device_create_buffer::<A>(
+            device_id,
+            &crate::resource::BufferDescriptor {
+                label: None,
+                size: 4 * 3,
+                usage: wgt::BufferUsages::INDIRECT | wgt::BufferUsages::STORAGE,
+                mapped_at_creation: false,
+            },
+            None,
+        );
+        if let Some(error) = error {
+            return Err(error.into());
+        }
+
+        let (dst_bind_group_id, error) = global.device_create_bind_group::<A>(
+            device_id,
+            &crate::binding_model::BindGroupDescriptor {
+                label: None,
+                layout: dst_bind_group_layout,
+                entries: std::borrow::Cow::Borrowed(&[crate::binding_model::BindGroupEntry {
+                    binding: 0,
+                    resource: crate::binding_model::BindingResource::Buffer(
+                        crate::binding_model::BufferBinding {
+                            buffer_id: dst_buffer_id,
+                            offset: 0,
+                            size: Some(std::num::NonZeroU64::new(4 * 3).unwrap()),
+                            allow_indirect_as_storage: false,
+                        },
+                    ),
+                }]),
+            },
+            None,
+        );
+        if let Some(error) = error {
+            return Err(error.into());
+        }
+
+        let hub = A::hub(global);
+
+        let pipeline = hub
+            .compute_pipelines
+            .write()
+            .remove(pipeline)
+            .ok_or(CreateDispatchIndirectValidationPipelineError::InvalidId)?;
+
+        let dst_buffer = hub
+            .buffers
+            .write()
+            .remove(dst_buffer_id)
+            .ok_or(CreateDispatchIndirectValidationPipelineError::InvalidId)?;
+
+        let dst_bind_group = hub
+            .bind_groups
+            .write()
+            .remove(dst_bind_group_id)
+            .ok_or(CreateDispatchIndirectValidationPipelineError::InvalidId)?;
+
+        Ok(Self {
+            pipeline,
+            dst_buffer,
+            dst_bind_group,
+            src_bind_group_layout,
+        })
+    }
+
+    pub(crate) fn calculate_src_buffer_binding_size(
+        device: &Device<A>,
+        buffer: &Buffer<A>,
+    ) -> u64 {
+        let alignment = device.limits.min_storage_buffer_offset_alignment as u64;
+
+        // We need to choose a binding size that can address all possible sets of 12
+        // contiguous bytes in the buffer, taking into account that the dynamic offset
+        // needs to be a multiple of `min_storage_buffer_offset_alignment`.
+
+        // Given the known variables: `offset`, `buffer_size`, `alignment` and the rule
+        // `offset + 12 <= buffer_size`.
+
+        // Let `chunks = floor(buffer_size / alignment)`.
+        // Let `chunk` be the interval `[0, chunks]`.
+        // Let `offset = alignment * chunk + r` where `r` is the interval `[0, alignment - 4]`.
+        // Let `binding` be the interval `[offset, offset + 12]`.
+        // Let `aligned_offset = alignment * chunk`.
+        // Let `aligned_binding` be the interval `[aligned_offset, aligned_offset + r + 12]`.
+        // Let `aligned_binding_size = r + 12 = [12, alignment + 8]`.
+        // Let `min_aligned_binding_size = alignment + 8`.
+
+        // `min_aligned_binding_size` is the minimum binding size required to address all
+        // possible sets of 12 contiguous bytes in the buffer, but the last
+        // `aligned_offset + min_aligned_binding_size` might overflow the buffer. In order
+        // to avoid this we must pick a larger `binding_size` that satisfies:
+        // `last_aligned_offset + binding_size = buffer_size` and
+        // `binding_size >= min_aligned_binding_size`.
+
+        // Let `buffer_size = alignment * chunks + sr` where `sr` is the interval `[0, alignment - 4]`.
+        // Let `last_aligned_offset = alignment * (chunks - u)` where `u` is the interval `[0, chunks]`.
+        // => `binding_size = buffer_size - last_aligned_offset`
+        // => `binding_size = alignment * chunks + sr - alignment * (chunks - u)`
+        // => `binding_size = alignment * chunks + sr - alignment * chunks + alignment * u`
+        // => `binding_size = sr + alignment * u`
+        // => `min_aligned_binding_size <= sr + alignment * u`
+        // => `alignment + 8 <= sr + alignment * u`
+        // => `u` must be at least 2
+        // => `binding_size = sr + alignment * 2`
+
+        let binding_size = 2 * alignment + (buffer.size % alignment);
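+        // Worked example (illustration only): with `alignment = 256` and
+        // `buffer.size = 2000`, `binding_size = 2 * 256 + 2000 % 256 = 720`. A dispatch
+        // at `offset = 1988` then binds `[1280, 2000)` (the injection code clamps the
+        // aligned offset to `buffer.size - binding_size = 1280`) and passes
+        // `(1988 - 1280) / 4 = 177` through the push constant.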
+        binding_size.min(buffer.size)
+    }
+
+    pub fn inject_dispatch_indirect_validation(
+        &self,
+        device: &Device<A>,
+        base: &mut BasePass<ArcComputeCommand<A>>,
+    ) -> Result<(), ComputePassErrorInner> {
+        if !base
+            .commands
+            .iter()
+            .any(|cmd| matches!(cmd, ArcComputeCommand::DispatchIndirect { .. }))
+        {
+            return Ok(());
+        }
+
+        profiling::scope!("CommandEncoder::inject_dispatch_indirect_validation");
+
+        let mut new_commands = Vec::with_capacity(base.commands.len());
+        let mut current_pipeline = None;
+        let mut current_first_2_bind_groups = [None, None];
+
+        for command in base.commands.drain(..) {
+            match command {
+                ArcComputeCommand::SetBindGroup {
+                    index,
+                    num_dynamic_offsets,
+                    ref bind_group,
+                } => {
+                    if index == 0 || index == 1 {
+                        current_first_2_bind_groups[index as usize] =
+                            Some((num_dynamic_offsets, bind_group.clone()));
+                    }
+                    new_commands.push(command);
+                }
+                ArcComputeCommand::SetPipeline(ref pipeline) => {
+                    current_pipeline = Some(pipeline.clone());
+                    new_commands.push(command);
+                }
+                ArcComputeCommand::DispatchIndirect { ref buffer, offset } => {
+                    // if there is no pipeline set, don't inject the validation commands as we will error anyway
+                    if let Some(original_pipeline) = current_pipeline.clone() {
+                        // validate some buffer properties that won't be validated later
+                        check_buffer_usage(
+                            buffer.as_info().id(),
+                            buffer.usage,
+                            wgt::BufferUsages::INDIRECT,
+                        )?;
+
+                        if offset % 4 != 0 {
+                            return Err(ComputePassErrorInner::UnalignedIndirectBufferOffset(
+                                offset,
+                            ));
+                        }
+
+                        let end_offset =
+                            offset + mem::size_of::<wgt::DispatchIndirectArgs>() as u64;
+                        if end_offset > buffer.size {
+                            return Err(ComputePassErrorInner::IndirectBufferOverrun {
+                                offset,
+                                end_offset,
+                                buffer_size: buffer.size,
+                            });
+                        }
+
+                        // The offset we receive is only required to be aligned to 4 bytes.
+                        //
+                        // Binding offsets and dynamic offsets are required to be aligned to
+                        // min_storage_buffer_offset_alignment (256 bytes by default).
+                        //
+                        // So, we work around this limitation by calculating an aligned offset
+                        // and passing the remainder through a push constant.
+                        //
+                        // We could bind the whole buffer and only have to pass the offset
+                        // through a push constant but we might run into the
+                        // max_storage_buffer_binding_size limit.
+                        //
+                        // See the inner docs of `calculate_src_buffer_binding_size` to
+                        // see how we get the appropriate `binding_size`.
+                        let alignment = device.limits.min_storage_buffer_offset_alignment as u64;
+                        let binding_size = Self::calculate_src_buffer_binding_size(device, buffer);
+                        let aligned_offset = offset - offset % alignment;
+                        // This works because `binding_size` is either `buffer.size` or `alignment * 2 + buffer.size % alignment`.
+                        let max_aligned_offset = buffer.size - binding_size;
+                        let aligned_offset = aligned_offset.min(max_aligned_offset);
+                        let offset_remainder = offset - aligned_offset;
+
+                        new_commands.push(ArcComputeCommand::SetPipeline(self.pipeline.clone()));
+
+                        base.dynamic_offsets.push(aligned_offset as u32);
+
+                        let values_offset = base.push_constant_data.len() as u32;
+                        base.push_constant_data.push(offset_remainder as u32 / 4);
+
+                        new_commands.push(ArcComputeCommand::SetPushConstant {
+                            offset: 0,
+                            size_bytes: 4,
+                            values_offset,
+                        });
+                        new_commands.push(ArcComputeCommand::SetBindGroup {
+                            index: 0,
+                            num_dynamic_offsets: 0,
+                            bind_group: self.dst_bind_group.clone(),
+                        });
+                        new_commands.push(ArcComputeCommand::SetBindGroup {
+                            index: 1,
+                            num_dynamic_offsets: 1,
+                            bind_group: buffer
+                                .indirect_validation_bind_group
+                                .get()
+                                .unwrap()
+                                .clone(),
+                        });
+                        new_commands.push(ArcComputeCommand::Dispatch([1, 1, 1]));
+
+                        new_commands.push(ArcComputeCommand::SetPipeline(original_pipeline));
+                        for (index, current_bind_group) in
+                            current_first_2_bind_groups.iter().enumerate()
+                        {
+                            if let Some((num_dynamic_offsets, bind_group)) =
+                                current_bind_group.clone()
+                            {
+                                new_commands.push(ArcComputeCommand::SetBindGroup {
+                                    index: index as u32,
+                                    num_dynamic_offsets,
+                                    bind_group,
+                                });
+                            } else {
+                                new_commands.push(ArcComputeCommand::UnsetBindGroup {
+                                    index: index as u32,
+                                });
+                            }
+                        }
+                        new_commands.push(ArcComputeCommand::DispatchIndirect {
+                            buffer: self.dst_buffer.clone(),
+                            offset: 0,
+                        });
+                    } else {
+                        new_commands.push(command)
+                    }
+                }
+                command => new_commands.push(command),
+            }
+        }
+        base.commands = new_commands;
+
+        Ok(())
+    }
+}
diff --git a/wgpu-core/src/instance.rs b/wgpu-core/src/instance.rs
index 5d21ed0398f..86965e52211 100644
--- a/wgpu-core/src/instance.rs
+++ b/wgpu-core/src/instance.rs
@@ -1109,10 +1109,23 @@ impl Global {
             Ok((device, queue)) => (device, queue),
             Err(e) => break e,
         };
-        let (device_id, _) = device_fid.assign(Arc::new(device));
+        let (device_id, device) = device_fid.assign(Arc::new(device));
         resource_log!("Created Device {:?}", device_id);
 
-        let device = hub.devices.get(device_id).unwrap();
         queue.device = Some(device.clone());
 
         let (queue_id, queue) = queue_fid.assign(Arc::new(queue));
 
         device.set_queue(queue);
 
+        #[cfg(feature = "indirect-validation")]
+        match crate::indirect_validation::IndirectValidation::new(self, device.clone()) {
+            Ok(indirect_validation) => {
+                device.indirect_validation.set(indirect_validation).unwrap();
+            }
+            Err(_) => {
+                hub.devices
+                    .force_replace_with_error(device_id, desc.label.borrow_or_default());
+                hub.queues
+                    .force_replace_with_error(queue_id, desc.label.borrow_or_default());
+                return (device_id, queue_id, Some(RequestDeviceError::Internal));
+            }
+        };
+
        return (device_id, queue_id, None);
     };
 
@@ -1161,10 +1174,23 @@ impl Global {
             Ok(device) => device,
             Err(e) => break e,
         };
-        let (device_id, _) = devices_fid.assign(Arc::new(device));
+        let (device_id, device) = devices_fid.assign(Arc::new(device));
         resource_log!("Created Device {:?}", device_id);
 
-        let device = hub.devices.get(device_id).unwrap();
         queue.device = Some(device.clone());
 
         let (queue_id, queue) = queues_fid.assign(Arc::new(queue));
 
         device.set_queue(queue);
 
+        #[cfg(feature = "indirect-validation")]
+        match crate::indirect_validation::IndirectValidation::new(self, device.clone()) {
+            Ok(indirect_validation) => {
+                device.indirect_validation.set(indirect_validation).unwrap();
+            }
+            Err(_) => {
+                hub.devices
+                    .force_replace_with_error(device_id, desc.label.borrow_or_default());
+                hub.queues
+                    .force_replace_with_error(queue_id, desc.label.borrow_or_default());
+                return (device_id, queue_id, Some(RequestDeviceError::Internal));
+            }
+        };
+
         return (device_id, queue_id, None);
     };
 
diff --git a/wgpu-core/src/lib.rs b/wgpu-core/src/lib.rs
index ebf80091c3f..ad85b5d0cdd 100644
--- a/wgpu-core/src/lib.rs
+++ b/wgpu-core/src/lib.rs
@@ -61,6 +61,8 @@ mod hash_utils;
 pub mod hub;
 pub mod id;
 pub mod identity;
+#[cfg(feature = "indirect-validation")]
+mod indirect_validation;
 mod init_tracker;
 pub mod instance;
 mod lock;
diff --git a/wgpu-core/src/pipeline.rs b/wgpu-core/src/pipeline.rs
index f3e7dbacb27..ab3f5045610 100644
--- a/wgpu-core/src/pipeline.rs
+++ b/wgpu-core/src/pipeline.rs
@@ -26,7 +26,7 @@ pub(crate) struct LateSizedBufferGroup {
 
 #[allow(clippy::large_enum_variant)]
 pub enum ShaderModuleSource<'a> {
-    #[cfg(feature = "wgsl")]
+    #[cfg(any(feature = "wgsl", feature = "indirect-validation"))]
     Wgsl(Cow<'a, str>),
     #[cfg(feature = "glsl")]
     Glsl(Cow<'a, str>, naga::front::glsl::Options),
@@ -45,6 +45,10 @@ pub struct ShaderModuleDescriptor<'a> {
     pub label: Label<'a>,
     #[cfg_attr(feature = "serde", serde(default))]
     pub shader_bound_checks: wgt::ShaderBoundChecks,
+    /// This is an internal flag used by indirect validation.
+    /// It allows usage of push constants without having the
+    /// [`Features::PUSH_CONSTANTS`](wgt::Features::PUSH_CONSTANTS) feature enabled.
+    pub ignore_push_constant_check: bool,
 }
 
 #[derive(Debug)]
@@ -113,7 +117,7 @@ impl ShaderModule {
 #[derive(Clone, Debug, Error)]
 #[non_exhaustive]
 pub enum CreateShaderModuleError {
-    #[cfg(feature = "wgsl")]
+    #[cfg(any(feature = "wgsl", feature = "indirect-validation"))]
     #[error(transparent)]
     Parsing(#[from] ShaderError<naga::front::wgsl::ParseError>),
     #[cfg(feature = "glsl")]
diff --git a/wgpu-core/src/resource.rs b/wgpu-core/src/resource.rs
index 9ae275615ad..4481769e01b 100644
--- a/wgpu-core/src/resource.rs
+++ b/wgpu-core/src/resource.rs
@@ -400,6 +400,8 @@ pub struct Buffer<A: HalApi> {
     pub(crate) info: ResourceInfo<Buffer<A>>,
     pub(crate) map_state: Mutex<BufferMapState<A>>,
     pub(crate) bind_groups: Mutex<Vec<Weak<BindGroup<A>>>>,
+    #[cfg(feature = "indirect-validation")]
+    pub(crate) indirect_validation_bind_group: once_cell::sync::OnceCell<Arc<BindGroup<A>>>,
 }
 
 impl<A: HalApi> Drop for Buffer<A> {
@@ -611,6 +613,8 @@ pub enum CreateBufferError {
     MaxBufferSize { requested: u64, maximum: u64 },
     #[error(transparent)]
     MissingDownlevelFlags(#[from] MissingDownlevelFlags),
+    #[error("Failed to create bind group for indirect buffer validation: {0}")]
+    IndirectValidationBindGroup(#[from] crate::binding_model::CreateBindGroupError),
 }
 
 impl<A: HalApi> Resource for Buffer<A> {
diff --git a/wgpu/Cargo.toml b/wgpu/Cargo.toml
index 81927f0a632..05c287dcd60 100644
--- a/wgpu/Cargo.toml
+++ b/wgpu/Cargo.toml
@@ -124,6 +124,12 @@ features = ["raw-window-handle"]
 workspace = true
 features = ["raw-window-handle"]
 
+# If we are not targeting WebGL, enable indirect-validation.
+# WebGL doesn't support indirect execution, so it is not needed there.
+[target.'cfg(not(target_arch = "wasm32"))'.dependencies.wgc]
+workspace = true
+features = ["indirect-validation"]
+
 # Enable `wgc` by default on macOS and iOS to allow the `metal` crate feature to
 # enable the Metal backend while being no-op on other targets.
 [target.'cfg(any(target_os = "macos", target_os = "ios"))'.dependencies.wgc]
diff --git a/wgpu/src/backend/wgpu_core.rs b/wgpu/src/backend/wgpu_core.rs
index e00bd4a3848..a230d06937e 100644
--- a/wgpu/src/backend/wgpu_core.rs
+++ b/wgpu/src/backend/wgpu_core.rs
@@ -892,6 +892,7 @@ impl crate::Context for ContextWgpuCore {
         let descriptor = wgc::pipeline::ShaderModuleDescriptor {
             label: desc.label.map(Borrowed),
             shader_bound_checks,
+            ignore_push_constant_check: false,
         };
         let source = match desc.source {
             #[cfg(feature = "spirv")]
@@ -950,6 +951,7 @@ impl crate::Context for ContextWgpuCore {
             // The value doesn't matter, since SPIR-V shaders aren't mutated to include
             // runtime checks.
             shader_bound_checks: unsafe { wgt::ShaderBoundChecks::unchecked() },
+            ignore_push_constant_check: false,
         };
         let (id, error) = wgc::gfx_select!(
             device => self.0.device_create_shader_module_spirv(*device, &descriptor, Borrowed(&desc.source), None)
         );
@@ -1033,6 +1035,7 @@ impl crate::Context for ContextWgpuCore {
                         buffer_id: binding.buffer.id.into(),
                         offset: binding.offset,
                         size: binding.size,
+                        allow_indirect_as_storage: false,
                     }));
                 }
             }
@@ -1053,6 +1056,7 @@ impl crate::Context for ContextWgpuCore {
                 buffer_id: buffer.id.into(),
                 offset,
                 size,
+                allow_indirect_as_storage: false,
             }),
             BindingResource::BufferArray(array) => {
                 let slice = &remaining_arrayed_buffer_bindings[..array.len()];
@@ -1132,6 +1136,7 @@ impl crate::Context for ContextWgpuCore {
             label: desc.label.map(Borrowed),
             bind_group_layouts: Borrowed(&temp_layouts),
             push_constant_ranges: Borrowed(desc.push_constant_ranges),
+            ignore_push_constant_check: false,
         };
         let (id, error) = wgc::gfx_select!(device => self.0.device_create_pipeline_layout(