Skip to content

Commit

Permalink
[wgpu-core/-hal] move raytracing alignments into hal (#6563)
Browse files Browse the repository at this point in the history
  • Loading branch information
Vecvec authored Nov 19, 2024
1 parent 2389106 commit 6f5014f
Show file tree
Hide file tree
Showing 16 changed files with 102 additions and 64 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ By @ErichDonGubler in [#6456](https://github.com/gfx-rs/wgpu/pull/6456), [#6148]
#### General

- Return submission index in `map_async` and `on_submitted_work_done` to track down completion of async callbacks. By @eliemichel in [#6360](https://github.com/gfx-rs/wgpu/pull/6360).
- Move raytracing alignments from core into HAL. By @Vecvec in [#6563](https://github.com/gfx-rs/wgpu/pull/6563).

### Changes

Expand Down
25 changes: 14 additions & 11 deletions wgpu-core/src/command/ray_tracing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::{
id::CommandEncoderId,
init_tracker::MemoryInitKind,
ray_tracing::{
tlas_instance_into_bytes, BlasAction, BlasBuildEntry, BlasGeometries, BlasTriangleGeometry,
BlasAction, BlasBuildEntry, BlasGeometries, BlasTriangleGeometry,
BuildAccelerationStructureError, TlasAction, TlasBuildEntry, TlasInstance, TlasPackage,
TraceBlasBuildEntry, TraceBlasGeometries, TraceBlasTriangleGeometry, TraceTlasInstance,
TraceTlasPackage, ValidateBlasActionsError, ValidateTlasActionsError,
Expand Down Expand Up @@ -60,9 +60,6 @@ struct TlasBufferStore {
entry: TlasBuildEntry,
}

// TODO: Get this from the device (e.g.
// `VkPhysicalDeviceAccelerationStructurePropertiesKHR.minAccelerationStructureScratchOffsetAlignment`).
// This is currently the largest possible value; some devices have lower limits (0, 64, 128),
// so this could create excess allocation. (Note: dx12 has 256.)
const SCRATCH_BUFFER_ALIGNMENT: u32 = 256;

impl Global {
// Currently this function is very similar to its safe counterpart, however certain parts of it are very different,
// making for the two to be implemented differently, the main difference is this function has separate buffers for each
Expand Down Expand Up @@ -193,6 +190,7 @@ impl Global {
&mut scratch_buffer_blas_size,
&mut blas_storage,
hub,
device.alignments.ray_tracing_scratch_buffer_alignment,
)?;

let mut scratch_buffer_tlas_size = 0;
Expand Down Expand Up @@ -260,7 +258,7 @@ impl Global {
let scratch_buffer_offset = scratch_buffer_tlas_size;
scratch_buffer_tlas_size += align_to(
tlas.size_info.build_scratch_size as u32,
SCRATCH_BUFFER_ALIGNMENT,
device.alignments.ray_tracing_scratch_buffer_alignment,
) as u64;

tlas_storage.push(UnsafeTlasStore {
Expand Down Expand Up @@ -508,6 +506,7 @@ impl Global {
&mut scratch_buffer_blas_size,
&mut blas_storage,
hub,
device.alignments.ray_tracing_scratch_buffer_alignment,
)?;
let mut tlas_lock_store = Vec::<(Option<TlasPackage>, Arc<Tlas>)>::new();

Expand Down Expand Up @@ -535,7 +534,7 @@ impl Global {
let scratch_buffer_offset = scratch_buffer_tlas_size;
scratch_buffer_tlas_size += align_to(
tlas.size_info.build_scratch_size as u32,
SCRATCH_BUFFER_ALIGNMENT,
device.alignments.ray_tracing_scratch_buffer_alignment,
) as u64;

let first_byte_index = instance_buffer_staging_source.len();
Expand All @@ -558,10 +557,13 @@ impl Global {

cmd_buf_data.trackers.blas_s.set_single(blas.clone());

instance_buffer_staging_source.extend(tlas_instance_into_bytes(
&instance,
blas.handle,
device.backend(),
instance_buffer_staging_source.extend(device.raw().tlas_instance_to_bytes(
hal::TlasInstance {
transform: *instance.transform,
custom_index: instance.custom_index,
mask: instance.mask,
blas_address: blas.handle,
},
));

instance_count += 1;
Expand Down Expand Up @@ -1013,6 +1015,7 @@ fn iter_buffers<'a, 'b>(
scratch_buffer_blas_size: &mut u64,
blas_storage: &mut Vec<BlasStore<'a>>,
hub: &Hub,
ray_tracing_scratch_buffer_alignment: u32,
) -> Result<(), BuildAccelerationStructureError> {
let mut triangle_entries =
Vec::<hal::AccelerationStructureTriangles<dyn hal::DynBuffer>>::new();
Expand Down Expand Up @@ -1192,7 +1195,7 @@ fn iter_buffers<'a, 'b>(
let scratch_buffer_offset = *scratch_buffer_blas_size;
*scratch_buffer_blas_size += align_to(
blas.size_info.build_scratch_size as u32,
SCRATCH_BUFFER_ALIGNMENT,
ray_tracing_scratch_buffer_alignment,
) as u64;

blas_storage.push(BlasStore {
Expand Down
4 changes: 2 additions & 2 deletions wgpu-core/src/device/ray_tracing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::{
global::Global,
id::{self, BlasId, TlasId},
lock::RwLock,
ray_tracing::{get_raw_tlas_instance_size, CreateBlasError, CreateTlasError},
ray_tracing::{CreateBlasError, CreateTlasError},
resource, LabelHelpers,
};
use hal::AccelerationStructureTriangleIndices;
Expand Down Expand Up @@ -135,7 +135,7 @@ impl Device {
.map_err(DeviceError::from_hal)?;

let instance_buffer_size =
get_raw_tlas_instance_size(self.backend()) * desc.max_instances.max(1) as usize;
self.alignments.raw_tlas_instance_size * desc.max_instances.max(1) as usize;
let instance_buffer = unsafe {
self.raw().create_buffer(&hal::BufferDescriptor {
label: Some("(wgpu-core) instances_buffer"),
Expand Down
49 changes: 2 additions & 47 deletions wgpu-core/src/ray_tracing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ use crate::{
id::{BlasId, BufferId, TlasId},
resource::CreateBufferError,
};
use std::{mem::size_of, sync::Arc};
use std::{num::NonZeroU64, slice};
use std::num::NonZeroU64;
use std::sync::Arc;

use crate::resource::{Blas, ResourceErrorIdent, Tlas};
use thiserror::Error;
Expand Down Expand Up @@ -276,48 +276,3 @@ pub struct TraceTlasPackage {
pub instances: Vec<Option<TraceTlasInstance>>,
pub lowest_unmodified: u32,
}

pub(crate) fn get_raw_tlas_instance_size(backend: wgt::Backend) -> usize {
// TODO: this should be provided by the backend
match backend {
wgt::Backend::Empty => 0,
wgt::Backend::Vulkan => 64,
_ => unimplemented!(),
}
}

/// Raw in-memory form of a single TLAS instance, as built by
/// `tlas_instance_into_bytes` for the Vulkan backend.
///
/// `#[repr(C)]` keeps the fields in declaration order so the value can be
/// copied out byte-for-byte. The fields add up to 48 + 4 + 4 + 8 = 64 bytes.
#[derive(Clone)]
#[repr(C)]
struct RawTlasInstance {
/// Copied verbatim from the instance's `transform`.
transform: [f32; 12],
/// Low 24 bits hold the custom index, high 8 bits hold the mask.
custom_index_and_mask: u32,
/// Always written as `0` by `tlas_instance_into_bytes`.
shader_binding_table_record_offset_and_flags: u32,
/// The BLAS address handed to `tlas_instance_into_bytes`.
acceleration_structure_reference: u64,
}

/// Serializes `instance` into the raw bytes `backend` expects for one TLAS instance.
///
/// * `Empty` backend: returns an empty vector.
/// * `Vulkan` backend: returns the bytes of a `RawTlasInstance` whose
///   `custom_index_and_mask` holds the custom index in its low 24 bits and the
///   mask in its high 8 bits, and whose acceleration structure reference is
///   `blas_address`.
///
/// # Panics
///
/// Panics via `unimplemented!()` for every other backend.
pub(crate) fn tlas_instance_into_bytes(
    instance: &TlasInstance,
    blas_address: u64,
    backend: wgt::Backend,
) -> Vec<u8> {
    // TODO: get the device to do this
    const MAX_U24: u32 = (1u32 << 24u32) - 1u32;

    // Deal with every non-Vulkan backend up front.
    match backend {
        wgt::Backend::Empty => return Vec::new(),
        wgt::Backend::Vulkan => {}
        _ => unimplemented!(),
    }

    let packed_index_and_mask =
        (instance.custom_index & MAX_U24) | (u32::from(instance.mask) << 24);
    let raw = RawTlasInstance {
        transform: *instance.transform,
        custom_index_and_mask: packed_index_and_mask,
        shader_binding_table_record_offset_and_flags: 0,
        acceleration_structure_reference: blas_address,
    };

    let raw_ptr: *const RawTlasInstance = &raw;
    // SAFETY: `raw` is a live local for the whole lifetime of the slice, and the
    // length is exactly `size_of::<RawTlasInstance>()`. The struct is `#[repr(C)]`
    // and its fields (48 + 4 + 4 + 8 bytes) leave no padding, so every byte read
    // is initialized.
    let raw_bytes =
        unsafe { slice::from_raw_parts::<u8>(raw_ptr.cast::<u8>(), size_of::<RawTlasInstance>()) };
    raw_bytes.to_vec()
}
2 changes: 2 additions & 0 deletions wgpu-hal/src/dx12/adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,8 @@ impl super::Adapter {
// Direct3D correctly bounds-checks all array accesses:
// https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#18.6.8.2%20Device%20Memory%20Reads
uniform_bounds_check_alignment: wgt::BufferSize::new(1).unwrap(),
raw_tlas_instance_size: 0,
ray_tracing_scratch_buffer_alignment: 0,
},
downlevel,
},
Expand Down
5 changes: 5 additions & 0 deletions wgpu-hal/src/dx12/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use super::{conv, descriptor, D3D12Lib};
use crate::{
auxil::{self, dxgi::result::HResult},
dx12::{borrow_optional_interface_temporarily, shader_compilation, Event},
TlasInstance,
};

// this has to match Naga's HLSL backend, and also needs to be null-terminated
Expand Down Expand Up @@ -1939,4 +1940,8 @@ impl crate::Device for super::Device {
total_reserved_bytes: upstream.total_reserved_bytes,
})
}

/// Not implemented for this backend yet: calling this panics via `todo!()`.
fn tlas_instance_to_bytes(&self, _instance: TlasInstance) -> Vec<u8> {
todo!()
}
}
7 changes: 6 additions & 1 deletion wgpu-hal/src/dynamic/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::{
GetAccelerationStructureBuildSizesDescriptor, Label, MemoryRange, PipelineCacheDescriptor,
PipelineCacheError, PipelineError, PipelineLayoutDescriptor, RenderPipelineDescriptor,
SamplerDescriptor, ShaderError, ShaderInput, ShaderModuleDescriptor, TextureDescriptor,
TextureViewDescriptor,
TextureViewDescriptor, TlasInstance,
};

use super::{
Expand Down Expand Up @@ -158,6 +158,7 @@ pub trait DynDevice: DynResource {
&self,
acceleration_structure: Box<dyn DynAccelerationStructure>,
);
/// Converts `instance` into a byte vector; the blanket `DynDevice` impl
/// forwards this to `Device::tlas_instance_to_bytes`.
fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec<u8>;

fn get_internal_counters(&self) -> wgt::HalCounters;
fn generate_allocator_report(&self) -> Option<wgt::AllocatorReport>;
Expand Down Expand Up @@ -520,6 +521,10 @@ impl<D: Device + DynResource> DynDevice for D {
unsafe { D::destroy_acceleration_structure(self, acceleration_structure.unbox()) }
}

/// Forwards to the concrete device's `Device::tlas_instance_to_bytes`.
fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec<u8> {
D::tlas_instance_to_bytes(self, instance)
}

fn get_internal_counters(&self) -> wgt::HalCounters {
D::get_internal_counters(self)
}
Expand Down
5 changes: 5 additions & 0 deletions wgpu-hal/src/empty.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#![allow(unused_variables)]

use crate::TlasInstance;
use std::ops::Range;

#[derive(Clone, Debug)]
Expand Down Expand Up @@ -306,6 +307,10 @@ impl crate::Device for Context {
}
unsafe fn destroy_acceleration_structure(&self, _acceleration_structure: Resource) {}

/// Always returns an empty byte vector; `instance` is ignored.
fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec<u8> {
vec![]
}

fn get_internal_counters(&self) -> wgt::HalCounters {
Default::default()
}
Expand Down
2 changes: 2 additions & 0 deletions wgpu-hal/src/gles/adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,8 @@ impl super::Adapter {
// being, provide 1 as the value here, to cause as little
// trouble as possible.
uniform_bounds_check_alignment: wgt::BufferSize::new(1).unwrap(),
raw_tlas_instance_size: 0,
ray_tracing_scratch_buffer_alignment: 0,
},
},
})
Expand Down
6 changes: 5 additions & 1 deletion wgpu-hal/src/gles/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use std::{
sync::{Arc, Mutex},
};

use crate::AtomicFenceValue;
use crate::{AtomicFenceValue, TlasInstance};
use arrayvec::ArrayVec;
use std::sync::atomic::Ordering;

Expand Down Expand Up @@ -1633,6 +1633,10 @@ impl crate::Device for super::Device {
) {
}

/// Not implemented for this backend: calling this panics via `unimplemented!()`.
fn tlas_instance_to_bytes(&self, _instance: TlasInstance) -> Vec<u8> {
unimplemented!()
}

fn get_internal_counters(&self) -> wgt::HalCounters {
self.counters.clone()
}
Expand Down
15 changes: 15 additions & 0 deletions wgpu-hal/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,7 @@ pub trait Device: WasmNotSendSync {
&self,
acceleration_structure: <Self::A as Api>::AccelerationStructure,
);
/// Converts `instance` into the backend-specific raw bytes of one TLAS instance.
///
/// Backends that do not implement this may panic instead of returning.
fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec<u8>;

fn get_internal_counters(&self) -> wgt::HalCounters;

Expand Down Expand Up @@ -1771,6 +1772,12 @@ pub struct Alignments {
/// [`Uniform`]: wgt::BufferBindingType::Uniform
/// [size]: BufferBinding::size
pub uniform_bounds_check_alignment: wgt::BufferSize,

/// The size, in bytes, of a single raw TLAS instance.
pub raw_tlas_instance_size: usize,

/// The alignment required for scratch buffer offsets used when building an acceleration structure.
pub ray_tracing_scratch_buffer_alignment: u32,
}

#[derive(Clone, Debug)]
Expand Down Expand Up @@ -2519,3 +2526,11 @@ bitflags::bitflags! {
pub struct AccelerationStructureBarrier {
pub usage: Range<AccelerationStructureUses>,
}

/// Backend-agnostic description of one TLAS instance, passed to
/// `Device::tlas_instance_to_bytes` to be turned into backend-specific bytes.
#[derive(Debug, Copy, Clone)]
pub struct TlasInstance {
/// Instance transform as 12 floats (presumably a 3x4 affine matrix — TODO confirm layout).
pub transform: [f32; 12],
/// Custom index of the instance; the Vulkan backend keeps only its low 24 bits.
pub custom_index: u32,
/// Instance mask; the Vulkan backend packs it into the 8 bits above the custom index.
pub mask: u8,
/// Address of the BLAS this instance refers to.
pub blas_address: u64,
}
2 changes: 2 additions & 0 deletions wgpu-hal/src/metal/adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1001,6 +1001,8 @@ impl super::PrivateCapabilities {
// Metal Shading Language it generates, so from `wgpu_hal`'s
// users' point of view, references are tightly checked.
uniform_bounds_check_alignment: wgt::BufferSize::new(1).unwrap(),
raw_tlas_instance_size: 0,
ray_tracing_scratch_buffer_alignment: 0,
},
downlevel,
}
Expand Down
5 changes: 5 additions & 0 deletions wgpu-hal/src/metal/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use std::{

use super::conv;
use crate::auxil::map_naga_stage;
use crate::TlasInstance;

type DeviceResult<T> = Result<T, crate::DeviceError>;

Expand Down Expand Up @@ -1426,6 +1427,10 @@ impl crate::Device for super::Device {
unimplemented!()
}

/// Not implemented for this backend: calling this panics via `unimplemented!()`.
fn tlas_instance_to_bytes(&self, _instance: TlasInstance) -> Vec<u8> {
unimplemented!()
}

fn get_internal_counters(&self) -> wgt::HalCounters {
self.counters.clone()
}
Expand Down
7 changes: 7 additions & 0 deletions wgpu-hal/src/vulkan/adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1140,6 +1140,13 @@ impl PhysicalDeviceProperties {
};
wgt::BufferSize::new(alignment).unwrap()
},
raw_tlas_instance_size: 64,
ray_tracing_scratch_buffer_alignment: self.acceleration_structure.map_or(
0,
|acceleration_structure| {
acceleration_structure.min_acceleration_structure_scratch_offset_alignment
},
),
}
}
}
Expand Down
22 changes: 20 additions & 2 deletions wgpu-hal/src/vulkan/device.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
use super::conv;
use super::{conv, RawTlasInstance};

use arrayvec::ArrayVec;
use ash::{khr, vk};
use parking_lot::Mutex;

use crate::TlasInstance;
use std::{
borrow::Cow,
collections::{hash_map::Entry, BTreeMap},
ffi::{CStr, CString},
mem,
mem::MaybeUninit,
num::NonZeroU32,
ptr,
ptr, slice,
sync::Arc,
};

Expand Down Expand Up @@ -2557,6 +2559,22 @@ impl crate::Device for super::Device {

self.counters.clone()
}

fn tlas_instance_to_bytes(&self, instance: TlasInstance) -> Vec<u8> {
const MAX_U24: u32 = (1u32 << 24u32) - 1u32;
let temp = RawTlasInstance {
transform: instance.transform,
custom_index_and_mask: (instance.custom_index & MAX_U24)
| (u32::from(instance.mask) << 24),
shader_binding_table_record_offset_and_flags: 0,
acceleration_structure_reference: instance.blas_address,
};
let temp: *const _ = &temp;
unsafe {
slice::from_raw_parts::<u8>(temp.cast::<u8>(), mem::size_of::<RawTlasInstance>())
.to_vec()
}
}
}

impl super::DeviceShared {
Expand Down
Loading

0 comments on commit 6f5014f

Please sign in to comment.