Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Zero-initialize workgroup memory #3174

Merged
merged 2 commits into from
Jan 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,8 @@ let texture = device.create_texture(&wgpu::TextureDescriptor {
- Implemented correleation between user timestamps and platform specific presentation timestamps via [`Adapter::get_presentation_timestamp`]. By @cwfitzgerald in [#3240](https://github.com/gfx-rs/wgpu/pull/3240)
- Added support for `Features::SHADER_PRIMITIVE_INDEX` on all backends. By @cwfitzgerald in [#3272](https://github.com/gfx-rs/wgpu/pull/3272)
- Implemented `TextureFormat::Stencil8`, allowing for stencil testing without depth components. By @Dinnerbone in [#3343](https://github.com/gfx-rs/wgpu/pull/3343)
- Implemented `add_srgb_suffix()` for `TextureFormat` for converting linear formats to sRGB. By @Elabajaba in [#3419](https://github.com/gfx-rs/wgpu/pull/3419)
- Implemented `add_srgb_suffix()` for `TextureFormat` for converting linear formats to sRGB. By @Elabajaba in [#3419](https://github.com/gfx-rs/wgpu/pull/3419)
- Zero-initialize workgroup memory. By @teoxoy in [#3174](https://github.com/gfx-rs/wgpu/pull/3174)

#### GLES

Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ path = "./wgpu-hal"

[workspace.dependencies.naga]
git = "https://github.com/gfx-rs/naga"
rev = "1be8024"
rev = "c7d02151f08d6285683795289b5725b827d836d1"
version = "0.10"

[workspace.dependencies]
Expand Down
2 changes: 1 addition & 1 deletion wgpu-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ thiserror = "1"

[dependencies.naga]
git = "https://github.com/gfx-rs/naga"
rev = "1be8024"
rev = "c7d02151f08d6285683795289b5725b827d836d1"
version = "0.10"
features = ["clone", "span", "validate"]

Expand Down
4 changes: 2 additions & 2 deletions wgpu-hal/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -113,14 +113,14 @@ android_system_properties = "0.1.1"

[dependencies.naga]
git = "https://github.com/gfx-rs/naga"
rev = "1be8024"
rev = "c7d02151f08d6285683795289b5725b827d836d1"
version = "0.10"
features = ["clone"]

# DEV dependencies
[dev-dependencies.naga]
git = "https://github.com/gfx-rs/naga"
rev = "1be8024"
rev = "c7d02151f08d6285683795289b5725b827d836d1"
version = "0.10"
features = ["wgsl-in"]

Expand Down
1 change: 1 addition & 0 deletions wgpu-hal/src/dx12/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1070,6 +1070,7 @@ impl crate::Device<super::Api> for super::Device {
fake_missing_bindings: false,
special_constants_binding,
push_constants_target,
zero_initialize_workgroup_memory: true,
},
})
}
Expand Down
1 change: 1 addition & 0 deletions wgpu-hal/src/gles/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1032,6 +1032,7 @@ impl crate::Device<super::Api> for super::Device {
version: self.shared.shading_language_version,
writer_flags,
binding_map,
zero_initialize_workgroup_memory: true,
},
})
}
Expand Down
1 change: 1 addition & 0 deletions wgpu-hal/src/metal/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,7 @@ impl crate::Device<super::Api> for super::Device {
// TODO: support bounds checks on binding arrays
binding_array: naga::proc::BoundsCheckPolicy::Unchecked,
},
zero_initialize_workgroup_memory: true,
},
total_push_constants,
})
Expand Down
41 changes: 41 additions & 0 deletions wgpu-hal/src/vulkan/adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ pub struct PhysicalDeviceFeatures {
vk::PhysicalDeviceShaderFloat16Int8Features,
vk::PhysicalDevice16BitStorageFeatures,
)>,
zero_initialize_workgroup_memory:
Option<vk::PhysicalDeviceZeroInitializeWorkgroupMemoryFeatures>,
}

// This is safe because the structs have `p_next: *mut c_void`, which we null out/never read.
Expand Down Expand Up @@ -69,6 +71,9 @@ impl PhysicalDeviceFeatures {
info = info.push_next(f16_i8_feature);
info = info.push_next(_16bit_feature);
}
if let Some(ref mut feature) = self.zero_initialize_workgroup_memory {
info = info.push_next(feature);
}
info
}

Expand Down Expand Up @@ -286,6 +291,19 @@ impl PhysicalDeviceFeatures {
} else {
None
},
zero_initialize_workgroup_memory: if effective_api_version >= vk::API_VERSION_1_3
|| enabled_extensions.contains(&vk::KhrZeroInitializeWorkgroupMemoryFn::name())
{
Some(
vk::PhysicalDeviceZeroInitializeWorkgroupMemoryFeatures::builder()
.shader_zero_initialize_workgroup_memory(
private_caps.zero_initialize_workgroup_memory,
)
.build(),
)
} else {
None
teoxoy marked this conversation as resolved.
Show resolved Hide resolved
},
}
}

Expand Down Expand Up @@ -876,6 +894,16 @@ impl super::InstanceShared {
builder = builder.push_next(&mut next.1);
}

// `VK_KHR_zero_initialize_workgroup_memory` is promoted to 1.3
if capabilities.effective_api_version >= vk::API_VERSION_1_3
|| capabilities.supports_extension(vk::KhrZeroInitializeWorkgroupMemoryFn::name())
{
let next = features
.zero_initialize_workgroup_memory
.insert(vk::PhysicalDeviceZeroInitializeWorkgroupMemoryFeatures::default());
builder = builder.push_next(next);
}

let mut features2 = builder.build();
unsafe {
get_device_properties.get_physical_device_features2(phd, &mut features2);
Expand Down Expand Up @@ -1035,6 +1063,11 @@ impl super::Instance {
.image_robustness
.map_or(false, |ext| ext.robust_image_access != 0),
},
zero_initialize_workgroup_memory: phd_features
.zero_initialize_workgroup_memory
.map_or(false, |ext| {
ext.shader_zero_initialize_workgroup_memory == vk::TRUE
}),
};
let capabilities = crate::Capabilities {
limits: phd_capabilities.to_wgpu_limits(),
Expand Down Expand Up @@ -1237,6 +1270,14 @@ impl super::Adapter {
// TODO: support bounds checks on binding arrays
binding_array: naga::proc::BoundsCheckPolicy::Unchecked,
},
zero_initialize_workgroup_memory: if self
.private_caps
.zero_initialize_workgroup_memory
{
spv::ZeroInitializeWorkgroupMemoryMode::Native
} else {
spv::ZeroInitializeWorkgroupMemoryMode::Polyfill
},
// We need to build this separately for each invocation, so just default it out here
binding_map: BTreeMap::default(),
}
Expand Down
1 change: 1 addition & 0 deletions wgpu-hal/src/vulkan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ struct PrivateCapabilities {
non_coherent_map_mask: wgt::BufferAddress,
robust_buffer_access: bool,
robust_image_access: bool,
zero_initialize_workgroup_memory: bool,
}

bitflags::bitflags!(
Expand Down
1 change: 1 addition & 0 deletions wgpu/tests/shader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use crate::common::TestingContext;

mod numeric_builtins;
mod struct_layout;
mod zero_init_workgroup_mem;

#[derive(Clone, Copy, PartialEq)]
enum InputStorageType {
Expand Down
183 changes: 183 additions & 0 deletions wgpu/tests/shader/zero_init_workgroup_mem.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
use std::num::NonZeroU64;

use wgpu::{
include_wgsl, Backends, BindGroupDescriptor, BindGroupEntry, BindGroupLayoutDescriptor,
BindGroupLayoutEntry, BindingResource, BindingType, BufferBinding, BufferBindingType,
BufferDescriptor, BufferUsages, CommandEncoderDescriptor, ComputePassDescriptor,
ComputePipelineDescriptor, DownlevelFlags, Limits, Maintain, MapMode, PipelineLayoutDescriptor,
ShaderStages,
};

use crate::common::{initialize_test, TestParameters, TestingContext};

#[test]
fn zero_init_workgroup_mem() {
initialize_test(
TestParameters::default()
.downlevel_flags(DownlevelFlags::COMPUTE_SHADERS)
.limits(Limits::downlevel_defaults())
// remove once we get to https://github.com/gfx-rs/wgpu/issues/3193 or
// https://github.com/gfx-rs/wgpu/issues/3160
.specific_failure(
Some(Backends::DX12),
Some(5140),
Some("Microsoft Basic Render Driver"),
true,
)
// this one is flakey
.specific_failure(
Some(Backends::VULKAN),
Some(6880),
Some("SwiftShader"),
true,
)
// TODO: investigate why it fails
.specific_failure(Some(Backends::GL), Some(65541), Some("llvmpipe"), false),
zero_init_workgroup_mem_impl,
);
}

const DISPATCH_SIZE: (u32, u32, u32) = (64, 64, 64);
const TOTAL_WORK_GROUPS: u32 = DISPATCH_SIZE.0 * DISPATCH_SIZE.1 * DISPATCH_SIZE.2;

/// nr of bytes we use in the shader
const SHADER_WORKGROUP_MEMORY: u32 = 512 * 4 + 4;
// assume we have this much workgroup memory (2GB)
const MAX_DEVICE_WORKGROUP_MEMORY: u32 = i32::MAX as u32;
const NR_OF_DISPATCHES: u32 =
MAX_DEVICE_WORKGROUP_MEMORY / (SHADER_WORKGROUP_MEMORY * TOTAL_WORK_GROUPS) + 1; // TODO: use div_ceil once stabilized

const OUTPUT_ARRAY_SIZE: u32 = TOTAL_WORK_GROUPS * NR_OF_DISPATCHES;
const BUFFER_SIZE: u64 = OUTPUT_ARRAY_SIZE as u64 * 4;
const BUFFER_BINDING_SIZE: u32 = TOTAL_WORK_GROUPS * 4;

fn zero_init_workgroup_mem_impl(ctx: TestingContext) {
let bgl = ctx
.device
.create_bind_group_layout(&BindGroupLayoutDescriptor {
label: None,
entries: &[BindGroupLayoutEntry {
binding: 0,
visibility: ShaderStages::COMPUTE,
ty: BindingType::Buffer {
ty: BufferBindingType::Storage { read_only: false },
has_dynamic_offset: true,
min_binding_size: None,
},
count: None,
}],
});

let output_buffer = ctx.device.create_buffer(&BufferDescriptor {
label: Some("output buffer"),
size: BUFFER_SIZE,
usage: BufferUsages::COPY_DST | BufferUsages::COPY_SRC | BufferUsages::STORAGE,
mapped_at_creation: false,
});

let mapping_buffer = ctx.device.create_buffer(&BufferDescriptor {
label: Some("mapping buffer"),
size: BUFFER_SIZE,
usage: BufferUsages::COPY_DST | BufferUsages::MAP_READ,
mapped_at_creation: false,
});

let bg = ctx.device.create_bind_group(&BindGroupDescriptor {
label: None,
layout: &bgl,
entries: &[BindGroupEntry {
binding: 0,
resource: BindingResource::Buffer(BufferBinding {
buffer: &output_buffer,
offset: 0,
size: Some(NonZeroU64::new(BUFFER_BINDING_SIZE as u64).unwrap()),
}),
}],
});

let pll = ctx
.device
.create_pipeline_layout(&PipelineLayoutDescriptor {
label: None,
bind_group_layouts: &[&bgl],
push_constant_ranges: &[],
});

let sm = ctx
.device
.create_shader_module(include_wgsl!("zero_init_workgroup_mem.wgsl"));

let pipeline_read = ctx
.device
.create_compute_pipeline(&ComputePipelineDescriptor {
label: Some("pipeline read"),
layout: Some(&pll),
module: &sm,
entry_point: "read",
});

let pipeline_write = ctx
.device
.create_compute_pipeline(&ComputePipelineDescriptor {
label: Some("pipeline write"),
layout: None,
module: &sm,
entry_point: "write",
});

// -- Initializing data --

let output_pre_init_data = vec![1; OUTPUT_ARRAY_SIZE as usize];
ctx.queue.write_buffer(
&output_buffer,
0,
bytemuck::cast_slice(&output_pre_init_data),
);

// -- Run test --

let mut encoder = ctx
.device
.create_command_encoder(&CommandEncoderDescriptor::default());

let mut cpass = encoder.begin_compute_pass(&ComputePassDescriptor::default());

cpass.set_pipeline(&pipeline_write);
for _ in 0..NR_OF_DISPATCHES {
cpass.dispatch_workgroups(DISPATCH_SIZE.0, DISPATCH_SIZE.1, DISPATCH_SIZE.2);
}

cpass.set_pipeline(&pipeline_read);
for i in 0..NR_OF_DISPATCHES {
cpass.set_bind_group(0, &bg, &[i * BUFFER_BINDING_SIZE]);
cpass.dispatch_workgroups(DISPATCH_SIZE.0, DISPATCH_SIZE.1, DISPATCH_SIZE.2);
}
drop(cpass);

// -- Pulldown data --

encoder.copy_buffer_to_buffer(&output_buffer, 0, &mapping_buffer, 0, BUFFER_SIZE);

ctx.queue.submit(Some(encoder.finish()));

mapping_buffer.slice(..).map_async(MapMode::Read, |_| ());
ctx.device.poll(Maintain::Wait);

let mapped = mapping_buffer.slice(..).get_mapped_range();

let typed: &[u32] = bytemuck::cast_slice(&*mapped);

// -- Check results --

let num_disptaches_failed = typed.iter().filter(|&&res| res != 0).count();
let ratio = (num_disptaches_failed as f32 / OUTPUT_ARRAY_SIZE as f32) * 100.;

assert!(
num_disptaches_failed == 0,
"Zero-initialization of workgroup memory failed ({:.0}% of disptaches failed).",
ratio
);

drop(mapped);
mapping_buffer.unmap();
}
31 changes: 31 additions & 0 deletions wgpu/tests/shader/zero_init_workgroup_mem.wgsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
const array_size = 512u;

struct WStruct {
arr: array<u32, array_size>,
atom: atomic<u32>
}

var<workgroup> w_mem: WStruct;

@group(0) @binding(0)
var<storage, read_write> output: array<u32>;

@compute @workgroup_size(1)
fn read(@builtin(workgroup_id) wgid: vec3<u32>, @builtin(num_workgroups) num_workgroups: vec3<u32>) {
var is_zero = true;
for(var i = 0u; i < array_size; i++) {
is_zero &= w_mem.arr[i] == 0u;
}
is_zero &= atomicLoad(&w_mem.atom) == 0u;

let idx = wgid.x + (wgid.y * num_workgroups.x) + (wgid.z * num_workgroups.x * num_workgroups.y);
output[idx] = u32(!is_zero);
}

@compute @workgroup_size(1)
fn write() {
for(var i = 0u; i < array_size; i++) {
w_mem.arr[i] = i;
}
atomicStore(&w_mem.atom, 3u);
}