Skip to content

Commit

Permalink
Pack multiple vertex and index arrays together into growable buffers.
Browse files Browse the repository at this point in the history
This commit uses the [`offset-allocator`] crate to combine vertex and
index arrays from different meshes into single buffers. Since the
primary source of `wgpu` overhead is the validation and synchronization
performed when switching buffers, this significantly improves Bevy's rendering
performance on many scenes.

This patch is a more flexible version of bevyengine#13218, which also used slabs.
Unlike bevyengine#13218, which used slabs of a fixed size, this commit implements
slabs that start small and can grow. In addition to reducing memory
usage, supporting slab growth reduces the number of vertex and index
buffer switches that need to happen during rendering, leading to
improved performance. To prevent pathological fragmentation behavior,
slabs are capped at a maximum size, and mesh arrays that are too large
get their own dedicated slabs.

As an additional improvement over bevyengine#13218, this commit allows the
application to customize all allocator heuristics. The
`MeshAllocatorSettings` resource contains values that adjust the minimum
and maximum slab sizes, the cutoff point at which meshes get their own
dedicated slabs, and the rate at which slabs grow. Hopefully-sensible
defaults have been chosen for each value.

Unfortunately, WebGL 2 doesn't support the *base vertex* feature, which
is necessary to pack vertex arrays from different meshes into the same
buffer. `wgpu` represents this restriction as the downlevel flag
`BASE_VERTEX`. This patch detects that bit and ensures that all vertex
buffers get dedicated slabs on that platform. Even on WebGL 2, though,
we can combine all *index* arrays into single buffers to reduce buffer
changes, and we do so.
  • Loading branch information
pcwalton committed Jul 9, 2024
1 parent 5ffdc0c commit 6403a60
Show file tree
Hide file tree
Showing 10 changed files with 1,150 additions and 71 deletions.
67 changes: 49 additions & 18 deletions crates/bevy_pbr/src/render/mesh.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::mem;

use allocator::MeshAllocator;
use bevy_asset::{load_internal_asset, AssetId};
use bevy_core_pipeline::{
core_3d::{AlphaMask3d, Opaque3d, Transmissive3d, Transparent3d, CORE_3D_DEPTH_FORMAT},
Expand Down Expand Up @@ -1210,6 +1211,7 @@ impl GetBatchData for MeshPipeline {
SRes<RenderMeshInstances>,
SRes<RenderLightmaps>,
SRes<RenderAssets<GpuMesh>>,
SRes<MeshAllocator>,
);
// The material bind group ID, the mesh ID, and the lightmap ID,
// respectively.
Expand All @@ -1218,7 +1220,7 @@ impl GetBatchData for MeshPipeline {
type BufferData = MeshUniform;

fn get_batch_data(
(mesh_instances, lightmaps, _): &SystemParamItem<Self::Param>,
(mesh_instances, lightmaps, _, _): &SystemParamItem<Self::Param>,
entity: Entity,
) -> Option<(Self::BufferData, Option<Self::CompareData>)> {
let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
Expand Down Expand Up @@ -1249,7 +1251,7 @@ impl GetFullBatchData for MeshPipeline {
type BufferInputData = MeshInputUniform;

fn get_index_and_compare_data(
(mesh_instances, lightmaps, _): &SystemParamItem<Self::Param>,
(mesh_instances, lightmaps, _, _): &SystemParamItem<Self::Param>,
entity: Entity,
) -> Option<(NonMaxU32, Option<Self::CompareData>)> {
// This should only be called during GPU building.
Expand All @@ -1275,7 +1277,7 @@ impl GetFullBatchData for MeshPipeline {
}

fn get_binned_batch_data(
(mesh_instances, lightmaps, _): &SystemParamItem<Self::Param>,
(mesh_instances, lightmaps, _, _): &SystemParamItem<Self::Param>,
entity: Entity,
) -> Option<Self::BufferData> {
let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
Expand All @@ -1294,7 +1296,7 @@ impl GetFullBatchData for MeshPipeline {
}

fn get_binned_index(
(mesh_instances, _, _): &SystemParamItem<Self::Param>,
(mesh_instances, _, _, _): &SystemParamItem<Self::Param>,
entity: Entity,
) -> Option<NonMaxU32> {
// This should only be called during GPU building.
Expand All @@ -1312,14 +1314,15 @@ impl GetFullBatchData for MeshPipeline {
}

fn get_batch_indirect_parameters_index(
(mesh_instances, _, meshes): &SystemParamItem<Self::Param>,
(mesh_instances, _, meshes, mesh_allocator): &SystemParamItem<Self::Param>,
indirect_parameters_buffer: &mut IndirectParametersBuffer,
entity: Entity,
instance_index: u32,
) -> Option<NonMaxU32> {
get_batch_indirect_parameters_index(
mesh_instances,
meshes,
mesh_allocator,
indirect_parameters_buffer,
entity,
instance_index,
Expand All @@ -1333,6 +1336,7 @@ impl GetFullBatchData for MeshPipeline {
fn get_batch_indirect_parameters_index(
mesh_instances: &RenderMeshInstances,
meshes: &RenderAssets<GpuMesh>,
mesh_allocator: &MeshAllocator,
indirect_parameters_buffer: &mut IndirectParametersBuffer,
entity: Entity,
instance_index: u32,
Expand All @@ -1348,24 +1352,29 @@ fn get_batch_indirect_parameters_index(

let mesh_instance = mesh_instances.get(&entity)?;
let mesh = meshes.get(mesh_instance.mesh_asset_id)?;
let vertex_buffer_slice = mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id)?;

// Note that `IndirectParameters` covers both of these structures, even
// though they actually have distinct layouts. See the comment above that
// type for more information.
let indirect_parameters = match mesh.buffer_info {
GpuBufferInfo::Indexed {
count: index_count, ..
} => IndirectParameters {
vertex_or_index_count: index_count,
instance_count: 0,
first_vertex: 0,
base_vertex_or_first_instance: 0,
first_instance: instance_index,
},
} => {
let index_buffer_slice =
mesh_allocator.mesh_index_slice(&mesh_instance.mesh_asset_id)?;
IndirectParameters {
vertex_or_index_count: index_count,
instance_count: 0,
first_vertex_or_first_index: index_buffer_slice.range.start,
base_vertex_or_first_instance: vertex_buffer_slice.range.start,
first_instance: instance_index,
}
}
GpuBufferInfo::NonIndexed => IndirectParameters {
vertex_or_index_count: mesh.vertex_count,
instance_count: 0,
first_vertex: 0,
first_vertex_or_first_index: vertex_buffer_slice.range.start,
base_vertex_or_first_instance: instance_index,
first_instance: instance_index,
},
Expand Down Expand Up @@ -2242,6 +2251,7 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
SRes<RenderMeshInstances>,
SRes<IndirectParametersBuffer>,
SRes<PipelineCache>,
SRes<MeshAllocator>,
Option<SRes<PreprocessPipelines>>,
);
type ViewQuery = Has<PreprocessBindGroup>;
Expand All @@ -2251,7 +2261,14 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
item: &P,
has_preprocess_bind_group: ROQueryItem<Self::ViewQuery>,
_item_query: Option<()>,
(meshes, mesh_instances, indirect_parameters_buffer, pipeline_cache, preprocess_pipelines): SystemParamItem<'w, '_, Self::Param>,
(
meshes,
mesh_instances,
indirect_parameters_buffer,
pipeline_cache,
mesh_allocator,
preprocess_pipelines,
): SystemParamItem<'w, '_, Self::Param>,
pass: &mut TrackedRenderPass<'w>,
) -> RenderCommandResult {
// If we're using GPU preprocessing, then we're dependent on that
Expand All @@ -2268,13 +2285,17 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
let meshes = meshes.into_inner();
let mesh_instances = mesh_instances.into_inner();
let indirect_parameters_buffer = indirect_parameters_buffer.into_inner();
let mesh_allocator = mesh_allocator.into_inner();

let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(item.entity()) else {
return RenderCommandResult::Failure;
};
let Some(gpu_mesh) = meshes.get(mesh_asset_id) else {
return RenderCommandResult::Failure;
};
let Some(vertex_buffer_slice) = mesh_allocator.mesh_vertex_slice(&mesh_asset_id) else {
return RenderCommandResult::Failure;
};

// Calculate the indirect offset, and look up the buffer.
let indirect_parameters = match item.extra_index().as_indirect_parameters_index() {
Expand All @@ -2291,21 +2312,31 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
},
};

pass.set_vertex_buffer(0, gpu_mesh.vertex_buffer.slice(..));
pass.set_vertex_buffer(0, vertex_buffer_slice.buffer.slice(..));

let batch_range = item.batch_range();

// Draw either directly or indirectly, as appropriate.
match &gpu_mesh.buffer_info {
GpuBufferInfo::Indexed {
buffer,
index_format,
count,
} => {
pass.set_index_buffer(buffer.slice(..), 0, *index_format);
let Some(index_buffer_slice) = mesh_allocator.mesh_index_slice(&mesh_asset_id)
else {
return RenderCommandResult::Failure;
};

pass.set_index_buffer(index_buffer_slice.buffer.slice(..), 0, *index_format);

match indirect_parameters {
None => {
pass.draw_indexed(0..*count, 0, batch_range.clone());
pass.draw_indexed(
index_buffer_slice.range.start
..(index_buffer_slice.range.start + *count),
vertex_buffer_slice.range.start as i32,
batch_range.clone(),
);
}
Some((indirect_parameters_offset, indirect_parameters_buffer)) => pass
.draw_indexed_indirect(
Expand Down
1 change: 1 addition & 0 deletions crates/bevy_render/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ profiling = { version = "1", features = [
async-channel = "2.2.0"
nonmax = "0.5"
smallvec = { version = "1.11", features = ["const_new"] }
offset-allocator = "0.2"

[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
# Omit the `glsl` feature in non-WebAssembly by default.
Expand Down
2 changes: 1 addition & 1 deletion crates/bevy_render/src/batching/gpu_preprocessing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ pub struct IndirectParameters {
pub instance_count: u32,

/// For `ArrayIndirectParameters`, the index of the first vertex we're to
/// draw; for `ElementIndirectParameters`, the index of the first index.
pub first_vertex: u32,
pub first_vertex_or_first_index: u32,

/// For `ArrayIndirectParameters`, `first_instance`; for
/// `ElementIndirectParameters`, `base_vertex`.
Expand Down
Loading

0 comments on commit 6403a60

Please sign in to comment.