Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[wgpu-core/-hal] move raytracing alignments into hal #6563

Merged
merged 5 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ By @ErichDonGubler in [#6456](https://github.com/gfx-rs/wgpu/pull/6456), [#6148]
#### General

- Return submission index in `map_async` and `on_submitted_work_done` to track down completion of async callbacks. By @eliemichel in [#6360](https://github.com/gfx-rs/wgpu/pull/6360).
- Move ray-tracing alignments from core into HAL. By @Vecvec in [#6563](https://github.com/gfx-rs/wgpu/pull/6563).

### Changes

Expand Down
25 changes: 14 additions & 11 deletions wgpu-core/src/command/ray_tracing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::{
id::CommandEncoderId,
init_tracker::MemoryInitKind,
ray_tracing::{
tlas_instance_into_bytes, BlasAction, BlasBuildEntry, BlasGeometries, BlasTriangleGeometry,
BlasAction, BlasBuildEntry, BlasGeometries, BlasTriangleGeometry,
BuildAccelerationStructureError, TlasAction, TlasBuildEntry, TlasInstance, TlasPackage,
TraceBlasBuildEntry, TraceBlasGeometries, TraceBlasTriangleGeometry, TraceTlasInstance,
TraceTlasPackage, ValidateBlasActionsError, ValidateTlasActionsError,
Expand Down Expand Up @@ -60,9 +60,6 @@ struct TlasBufferStore {
entry: TlasBuildEntry,
}

// TODO: Get this from the device (e.g. VkPhysicalDeviceAccelerationStructurePropertiesKHR.minAccelerationStructureScratchOffsetAlignment) this is currently the largest possible some devices have 0, 64, 128 (lower limits) so this could create excess allocation (Note: dx12 has 256).
const SCRATCH_BUFFER_ALIGNMENT: u32 = 256;

impl Global {
// Currently this function is very similar to its safe counterpart, however certain parts of it are very different,
// making for the two to be implemented differently, the main difference is this function has separate buffers for each
Expand Down Expand Up @@ -193,6 +190,7 @@ impl Global {
&mut scratch_buffer_blas_size,
&mut blas_storage,
hub,
device.alignments.ray_tracing_scratch_buffer_alignment,
)?;

let mut scratch_buffer_tlas_size = 0;
Expand Down Expand Up @@ -260,7 +258,7 @@ impl Global {
let scratch_buffer_offset = scratch_buffer_tlas_size;
scratch_buffer_tlas_size += align_to(
tlas.size_info.build_scratch_size as u32,
SCRATCH_BUFFER_ALIGNMENT,
device.alignments.ray_tracing_scratch_buffer_alignment,
) as u64;

tlas_storage.push(UnsafeTlasStore {
Expand Down Expand Up @@ -508,6 +506,7 @@ impl Global {
&mut scratch_buffer_blas_size,
&mut blas_storage,
hub,
device.alignments.ray_tracing_scratch_buffer_alignment,
)?;
let mut tlas_lock_store = Vec::<(Option<TlasPackage>, Arc<Tlas>)>::new();

Expand Down Expand Up @@ -535,7 +534,7 @@ impl Global {
let scratch_buffer_offset = scratch_buffer_tlas_size;
scratch_buffer_tlas_size += align_to(
tlas.size_info.build_scratch_size as u32,
SCRATCH_BUFFER_ALIGNMENT,
device.alignments.ray_tracing_scratch_buffer_alignment,
) as u64;

let first_byte_index = instance_buffer_staging_source.len();
Expand All @@ -558,10 +557,13 @@ impl Global {

cmd_buf_data.trackers.blas_s.set_single(blas.clone());

instance_buffer_staging_source.extend(tlas_instance_into_bytes(
&instance,
blas.handle,
device.backend(),
instance_buffer_staging_source.extend(device.raw().tlas_instance_to_bytes(
hal::TlasInstance {
transform: *instance.transform,
custom_index: instance.custom_index,
mask: instance.mask,
blas_address: blas.handle,
},
));

instance_count += 1;
Expand Down Expand Up @@ -1013,6 +1015,7 @@ fn iter_buffers<'a, 'b>(
scratch_buffer_blas_size: &mut u64,
blas_storage: &mut Vec<BlasStore<'a>>,
hub: &Hub,
ray_tracing_scratch_buffer_alignment: u32,
) -> Result<(), BuildAccelerationStructureError> {
let mut triangle_entries =
Vec::<hal::AccelerationStructureTriangles<dyn hal::DynBuffer>>::new();
Expand Down Expand Up @@ -1192,7 +1195,7 @@ fn iter_buffers<'a, 'b>(
let scratch_buffer_offset = *scratch_buffer_blas_size;
*scratch_buffer_blas_size += align_to(
blas.size_info.build_scratch_size as u32,
SCRATCH_BUFFER_ALIGNMENT,
ray_tracing_scratch_buffer_alignment,
) as u64;

blas_storage.push(BlasStore {
Expand Down
4 changes: 2 additions & 2 deletions wgpu-core/src/device/ray_tracing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::{
global::Global,
id::{self, BlasId, TlasId},
lock::RwLock,
ray_tracing::{get_raw_tlas_instance_size, CreateBlasError, CreateTlasError},
ray_tracing::{CreateBlasError, CreateTlasError},
resource, LabelHelpers,
};
use hal::AccelerationStructureTriangleIndices;
Expand Down Expand Up @@ -135,7 +135,7 @@ impl Device {
.map_err(DeviceError::from_hal)?;

let instance_buffer_size =
get_raw_tlas_instance_size(self.backend()) * desc.max_instances.max(1) as usize;
self.alignments.raw_tlas_instance_size * desc.max_instances.max(1) as usize;
let instance_buffer = unsafe {
self.raw().create_buffer(&hal::BufferDescriptor {
label: Some("(wgpu-core) instances_buffer"),
Expand Down
49 changes: 2 additions & 47 deletions wgpu-core/src/ray_tracing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ use crate::{
id::{BlasId, BufferId, TlasId},
resource::CreateBufferError,
};
use std::{mem::size_of, sync::Arc};
use std::{num::NonZeroU64, slice};
use std::num::NonZeroU64;
use std::sync::Arc;

use crate::resource::{Blas, ResourceErrorIdent, Tlas};
use thiserror::Error;
Expand Down Expand Up @@ -276,48 +276,3 @@ pub struct TraceTlasPackage {
pub instances: Vec<Option<TraceTlasInstance>>,
pub lowest_unmodified: u32,
}

/// Returns the size in bytes of one raw TLAS instance for `backend`.
///
/// `Empty` reports `0` and `Vulkan` reports `64`.
///
/// # Panics
///
/// Panics via `unimplemented!()` for every other backend.
pub(crate) fn get_raw_tlas_instance_size(backend: wgt::Backend) -> usize {
    // TODO: this should be provided by the backend
    if matches!(backend, wgt::Backend::Empty) {
        return 0;
    }
    if matches!(backend, wgt::Backend::Vulkan) {
        return 64;
    }
    unimplemented!()
}

/// Raw TLAS instance record whose in-memory bytes `tlas_instance_into_bytes` copies out
/// for the Vulkan backend.
///
/// `#[repr(C)]` keeps the fields in declaration order. The field sizes add up to
/// 48 + 4 + 4 + 8 = 64 bytes, matching the value `get_raw_tlas_instance_size` reports
/// for Vulkan.
#[derive(Clone)]
#[repr(C)]
struct RawTlasInstance {
// Copied unchanged from `TlasInstance::transform`.
transform: [f32; 12],
// Low 24 bits: the instance's custom index; high 8 bits: the instance's mask.
custom_index_and_mask: u32,
// Always written as 0 by `tlas_instance_into_bytes`.
shader_binding_table_record_offset_and_flags: u32,
// The BLAS address supplied by the caller.
acceleration_structure_reference: u64,
}

/// Encodes `instance` as the raw bytes stored in the instance buffer of `backend`.
///
/// * `Empty` produces no bytes.
/// * `Vulkan` produces the in-memory bytes of a `RawTlasInstance` built from the
///   instance's transform, its custom index (low 24 bits) combined with its mask
///   (high 8 bits), a zero shader-binding-table offset/flags word, and `blas_address`.
///
/// # Panics
///
/// Panics via `unimplemented!()` for every other backend.
pub(crate) fn tlas_instance_into_bytes(
    instance: &TlasInstance,
    blas_address: u64,
    backend: wgt::Backend,
) -> Vec<u8> {
    // TODO: get the device to do this
    match backend {
        wgt::Backend::Empty => return Vec::new(),
        wgt::Backend::Vulkan => {}
        _ => unimplemented!(),
    }

    // Only the low 24 bits of the custom index fit next to the 8-bit mask.
    const CUSTOM_INDEX_BITS: u32 = (1u32 << 24u32) - 1u32;
    let index_bits = instance.custom_index & CUSTOM_INDEX_BITS;
    let mask_bits = u32::from(instance.mask) << 24;

    let record = RawTlasInstance {
        transform: *instance.transform,
        custom_index_and_mask: index_bits | mask_bits,
        shader_binding_table_record_offset_and_flags: 0,
        acceleration_structure_reference: blas_address,
    };

    let record_ptr: *const RawTlasInstance = &record;
    let byte_len = size_of::<RawTlasInstance>();
    // SAFETY: `record_ptr` points at `record`, which stays alive until the end of this
    // function, and `byte_len` is exactly its size. `RawTlasInstance` is `#[repr(C)]`
    // with fields of 48 + 4 + 4 + 8 bytes, so there are no padding bytes to read.
    let raw_bytes = unsafe { slice::from_raw_parts::<u8>(record_ptr.cast::<u8>(), byte_len) };
    raw_bytes.to_vec()
}
2 changes: 2 additions & 0 deletions wgpu-hal/src/dx12/adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,8 @@ impl super::Adapter {
// Direct3D correctly bounds-checks all array accesses:
// https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#18.6.8.2%20Device%20Memory%20Reads
uniform_bounds_check_alignment: wgt::BufferSize::new(1).unwrap(),
raw_tlas_instance_size: 0,
ray_tracing_scratch_buffer_alignment: 0,
},
downlevel,
},
Expand Down
5 changes: 5 additions & 0 deletions wgpu-hal/src/dx12/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use super::{conv, descriptor, D3D12Lib};
use crate::{
auxil::{self, dxgi::result::HResult},
dx12::{borrow_optional_interface_temporarily, shader_compilation, Event},
TlasInstance,
};

// this has to match Naga's HLSL backend, and also needs to be null-terminated
Expand Down Expand Up @@ -1939,4 +1940,8 @@ impl crate::Device for super::Device {
total_reserved_bytes: upstream.total_reserved_bytes,
})
}

/// Converts a `TlasInstance` into raw bytes for this backend.
///
/// Not implemented for DX12 yet: the body is `todo!()`, so calling this panics.
fn tlas_instance_to_bytes(&self, _instance: TlasInstance) -> Vec<u8> {
todo!()
}
}
7 changes: 6 additions & 1 deletion wgpu-hal/src/dynamic/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::{
GetAccelerationStructureBuildSizesDescriptor, Label, MemoryRange, PipelineCacheDescriptor,
PipelineCacheError, PipelineError, PipelineLayoutDescriptor, RenderPipelineDescriptor,
SamplerDescriptor, ShaderError, ShaderInput, ShaderModuleDescriptor, TextureDescriptor,
TextureViewDescriptor,
TextureViewDescriptor, TlasInstance,
};

use super::{
Expand Down Expand Up @@ -158,6 +158,7 @@ pub trait DynDevice: DynResource {
&self,
acceleration_structure: Box<dyn DynAccelerationStructure>,
);
fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec<u8>;

fn get_internal_counters(&self) -> wgt::HalCounters;
fn generate_allocator_report(&self) -> Option<wgt::AllocatorReport>;
Expand Down Expand Up @@ -520,6 +521,10 @@ impl<D: Device + DynResource> DynDevice for D {
unsafe { D::destroy_acceleration_structure(self, acceleration_structure.unbox()) }
}

/// Forwards to the concrete device's `tlas_instance_to_bytes` and returns its bytes unchanged.
fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec<u8> {
D::tlas_instance_to_bytes(self, instance)
}

fn get_internal_counters(&self) -> wgt::HalCounters {
D::get_internal_counters(self)
}
Expand Down
5 changes: 5 additions & 0 deletions wgpu-hal/src/empty.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#![allow(unused_variables)]

use crate::TlasInstance;
use std::ops::Range;

#[derive(Clone, Debug)]
Expand Down Expand Up @@ -306,6 +307,10 @@ impl crate::Device for Context {
}
unsafe fn destroy_acceleration_structure(&self, _acceleration_structure: Resource) {}

/// Always returns an empty byte vector; `instance` is ignored by the empty backend.
fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec<u8> {
    Vec::new()
}

fn get_internal_counters(&self) -> wgt::HalCounters {
Default::default()
}
Expand Down
2 changes: 2 additions & 0 deletions wgpu-hal/src/gles/adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,8 @@ impl super::Adapter {
// being, provide 1 as the value here, to cause as little
// trouble as possible.
uniform_bounds_check_alignment: wgt::BufferSize::new(1).unwrap(),
raw_tlas_instance_size: 0,
ray_tracing_scratch_buffer_alignment: 0,
},
},
})
Expand Down
6 changes: 5 additions & 1 deletion wgpu-hal/src/gles/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use std::{
sync::{Arc, Mutex},
};

use crate::AtomicFenceValue;
use crate::{AtomicFenceValue, TlasInstance};
use arrayvec::ArrayVec;
use std::sync::atomic::Ordering;

Expand Down Expand Up @@ -1633,6 +1633,10 @@ impl crate::Device for super::Device {
) {
}

/// Converts a `TlasInstance` into raw bytes for this backend.
///
/// Not implemented for GLES: the body is `unimplemented!()`, so calling this panics.
fn tlas_instance_to_bytes(&self, _instance: TlasInstance) -> Vec<u8> {
unimplemented!()
}

fn get_internal_counters(&self) -> wgt::HalCounters {
self.counters.clone()
}
Expand Down
15 changes: 15 additions & 0 deletions wgpu-hal/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,7 @@ pub trait Device: WasmNotSendSync {
&self,
acceleration_structure: <Self::A as Api>::AccelerationStructure,
);
fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec<u8>;

fn get_internal_counters(&self) -> wgt::HalCounters;

Expand Down Expand Up @@ -1771,6 +1772,12 @@ pub struct Alignments {
/// [`Uniform`]: wgt::BufferBindingType::Uniform
/// [size]: BufferBinding::size
pub uniform_bounds_check_alignment: wgt::BufferSize,

/// The size, in bytes, of a single raw TLAS instance as stored in the instance buffer.
pub raw_tlas_instance_size: usize,

/// The alignment required for scratch buffer offsets used when building an acceleration structure.
pub ray_tracing_scratch_buffer_alignment: u32,
}

#[derive(Clone, Debug)]
Expand Down Expand Up @@ -2519,3 +2526,11 @@ bitflags::bitflags! {
pub struct AccelerationStructureBarrier {
pub usage: Range<AccelerationStructureUses>,
}

/// Backend-independent description of one TLAS instance.
///
/// Passed to [`Device::tlas_instance_to_bytes`], which turns it into the raw bytes
/// a backend stores in its instance buffer.
#[derive(Debug, Copy, Clone)]
pub struct TlasInstance {
/// Instance transform as 12 floats (presumably a 3x4 matrix; confirm the row/column
/// order against the backend).
pub transform: [f32; 12],
/// Custom index of the instance. The Vulkan backend keeps only the low 24 bits.
pub custom_index: u32,
/// Instance mask. The Vulkan backend packs it into the top 8 bits of the word that
/// also holds `custom_index`.
pub mask: u8,
/// Address of the BLAS this instance refers to.
pub blas_address: u64,
}
2 changes: 2 additions & 0 deletions wgpu-hal/src/metal/adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1001,6 +1001,8 @@ impl super::PrivateCapabilities {
// Metal Shading Language it generates, so from `wgpu_hal`'s
// users' point of view, references are tightly checked.
uniform_bounds_check_alignment: wgt::BufferSize::new(1).unwrap(),
raw_tlas_instance_size: 0,
ray_tracing_scratch_buffer_alignment: 0,
},
downlevel,
}
Expand Down
5 changes: 5 additions & 0 deletions wgpu-hal/src/metal/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use std::{

use super::conv;
use crate::auxil::map_naga_stage;
use crate::TlasInstance;

type DeviceResult<T> = Result<T, crate::DeviceError>;

Expand Down Expand Up @@ -1426,6 +1427,10 @@ impl crate::Device for super::Device {
unimplemented!()
}

/// Converts a `TlasInstance` into raw bytes for this backend.
///
/// Not implemented for Metal: the body is `unimplemented!()`, so calling this panics.
fn tlas_instance_to_bytes(&self, _instance: TlasInstance) -> Vec<u8> {
unimplemented!()
}

fn get_internal_counters(&self) -> wgt::HalCounters {
self.counters.clone()
}
Expand Down
7 changes: 7 additions & 0 deletions wgpu-hal/src/vulkan/adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1140,6 +1140,13 @@ impl PhysicalDeviceProperties {
};
wgt::BufferSize::new(alignment).unwrap()
},
raw_tlas_instance_size: 64,
ray_tracing_scratch_buffer_alignment: self.acceleration_structure.map_or(
0,
|acceleration_structure| {
acceleration_structure.min_acceleration_structure_scratch_offset_alignment
},
),
}
}
}
Expand Down
22 changes: 20 additions & 2 deletions wgpu-hal/src/vulkan/device.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
use super::conv;
use super::{conv, RawTlasInstance};

use arrayvec::ArrayVec;
use ash::{khr, vk};
use parking_lot::Mutex;

use crate::TlasInstance;
use std::{
borrow::Cow,
collections::{hash_map::Entry, BTreeMap},
ffi::{CStr, CString},
mem,
mem::MaybeUninit,
num::NonZeroU32,
ptr,
ptr, slice,
sync::Arc,
};

Expand Down Expand Up @@ -2557,6 +2559,22 @@ impl crate::Device for super::Device {

self.counters.clone()
}

/// Converts `instance` into the bytes of a `RawTlasInstance`.
///
/// The custom index is truncated to its low 24 bits and combined with the mask in the
/// high 8 bits of one `u32`; the shader-binding-table offset/flags word is written as 0;
/// the BLAS address becomes the acceleration structure reference.
fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec<u8> {
    // Only the low 24 bits of the custom index fit next to the 8-bit mask.
    const CUSTOM_INDEX_BITS: u32 = (1u32 << 24u32) - 1u32;
    let index_bits = instance.custom_index & CUSTOM_INDEX_BITS;
    let mask_bits = u32::from(instance.mask) << 24;

    let record = RawTlasInstance {
        transform: instance.transform,
        custom_index_and_mask: index_bits | mask_bits,
        shader_binding_table_record_offset_and_flags: 0,
        acceleration_structure_reference: instance.blas_address,
    };

    let record_ptr: *const RawTlasInstance = &record;
    let byte_len = mem::size_of::<RawTlasInstance>();
    // SAFETY: `record_ptr` points at `record`, which stays alive until the end of this
    // function, and `byte_len` is exactly its size.
    // NOTE(review): assumes `RawTlasInstance` is `#[repr(C)]` with no padding bytes;
    // confirm against its definition in the Vulkan module.
    let raw_bytes = unsafe { slice::from_raw_parts::<u8>(record_ptr.cast::<u8>(), byte_len) };
    raw_bytes.to_vec()
}
}

impl super::DeviceShared {
Expand Down
Loading