Pack multiple vertex and index arrays together into growable buffers.

This commit uses the `offset-allocator` crate to combine vertex and index arrays from different meshes into single buffers. Since the primary source of `wgpu` overhead is validation and synchronization when switching buffers, this significantly improves Bevy's rendering performance on many scenes. This patch is a more flexible version of bevyengine#13218, which also used slabs. Unlike bevyengine#13218, which used slabs of a fixed size, this commit implements slabs that start small and can grow. In addition to reducing memory usage, supporting slab growth reduces the number of vertex and index buffer switches that need to happen during rendering, leading to improved performance. To prevent pathological fragmentation behavior, slabs are capped to a maximum size, and mesh arrays that are too large get their own dedicated slabs. As an additional improvement over bevyengine#13218, this commit allows the application to customize all allocator heuristics. The `MeshAllocatorSettings` resource contains values that adjust the minimum and maximum slab sizes, the cutoff point at which meshes get their own dedicated slabs, and the rate at which slabs grow. Hopefully sensible defaults have been chosen for each value. Unfortunately, WebGL 2 doesn't support the *base vertex* feature, which is necessary to pack vertex arrays from different meshes into the same buffer. `wgpu` represents this restriction as the downlevel flag `BASE_VERTEX`. This patch detects that bit and ensures that all vertex buffers get dedicated slabs on that platform. Even on WebGL 2, though, we can combine all *index* arrays into single buffers to reduce buffer changes, and we do so.
pcwalton · Jul 9, 2024 · 6403a60 · 6403a60
1 parent 5ffdc0c
commit 6403a60
Show file tree

Hide file tree

Showing 10 changed files with 1,150 additions and 71 deletions.
diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs
@@ -1,5 +1,6 @@
 use std::mem;
 
+use allocator::MeshAllocator;
 use bevy_asset::{load_internal_asset, AssetId};
 use bevy_core_pipeline::{
     core_3d::{AlphaMask3d, Opaque3d, Transmissive3d, Transparent3d, CORE_3D_DEPTH_FORMAT},
@@ -1210,6 +1211,7 @@ impl GetBatchData for MeshPipeline {
         SRes<RenderMeshInstances>,
         SRes<RenderLightmaps>,
         SRes<RenderAssets<GpuMesh>>,
+        SRes<MeshAllocator>,
     );
     // The material bind group ID, the mesh ID, and the lightmap ID,
     // respectively.
@@ -1218,7 +1220,7 @@ impl GetBatchData for MeshPipeline {
     type BufferData = MeshUniform;
 
     fn get_batch_data(
-        (mesh_instances, lightmaps, _): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _, _): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<(Self::BufferData, Option<Self::CompareData>)> {
         let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
@@ -1249,7 +1251,7 @@ impl GetFullBatchData for MeshPipeline {
     type BufferInputData = MeshInputUniform;
 
     fn get_index_and_compare_data(
-        (mesh_instances, lightmaps, _): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _, _): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<(NonMaxU32, Option<Self::CompareData>)> {
         // This should only be called during GPU building.
@@ -1275,7 +1277,7 @@ impl GetFullBatchData for MeshPipeline {
     }
 
     fn get_binned_batch_data(
-        (mesh_instances, lightmaps, _): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _, _): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<Self::BufferData> {
         let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
@@ -1294,7 +1296,7 @@ impl GetFullBatchData for MeshPipeline {
     }
 
     fn get_binned_index(
-        (mesh_instances, _, _): &SystemParamItem<Self::Param>,
+        (mesh_instances, _, _, _): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<NonMaxU32> {
         // This should only be called during GPU building.
@@ -1312,14 +1314,15 @@ impl GetFullBatchData for MeshPipeline {
     }
 
     fn get_batch_indirect_parameters_index(
-        (mesh_instances, _, meshes): &SystemParamItem<Self::Param>,
+        (mesh_instances, _, meshes, mesh_allocator): &SystemParamItem<Self::Param>,
         indirect_parameters_buffer: &mut IndirectParametersBuffer,
         entity: Entity,
         instance_index: u32,
     ) -> Option<NonMaxU32> {
         get_batch_indirect_parameters_index(
             mesh_instances,
             meshes,
+            mesh_allocator,
             indirect_parameters_buffer,
             entity,
             instance_index,
@@ -1333,6 +1336,7 @@ impl GetFullBatchData for MeshPipeline {
 fn get_batch_indirect_parameters_index(
     mesh_instances: &RenderMeshInstances,
     meshes: &RenderAssets<GpuMesh>,
+    mesh_allocator: &MeshAllocator,
     indirect_parameters_buffer: &mut IndirectParametersBuffer,
     entity: Entity,
     instance_index: u32,
@@ -1348,24 +1352,29 @@ fn get_batch_indirect_parameters_index(
 
     let mesh_instance = mesh_instances.get(&entity)?;
     let mesh = meshes.get(mesh_instance.mesh_asset_id)?;
+    let vertex_buffer_slice = mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id)?;
 
     // Note that `IndirectParameters` covers both of these structures, even
     // though they actually have distinct layouts. See the comment above that
     // type for more information.
     let indirect_parameters = match mesh.buffer_info {
         GpuBufferInfo::Indexed {
             count: index_count, ..
-        } => IndirectParameters {
-            vertex_or_index_count: index_count,
-            instance_count: 0,
-            first_vertex: 0,
-            base_vertex_or_first_instance: 0,
-            first_instance: instance_index,
-        },
+        } => {
+            let index_buffer_slice =
+                mesh_allocator.mesh_index_slice(&mesh_instance.mesh_asset_id)?;
+            IndirectParameters {
+                vertex_or_index_count: index_count,
+                instance_count: 0,
+                first_vertex_or_first_index: index_buffer_slice.range.start,
+                base_vertex_or_first_instance: vertex_buffer_slice.range.start,
+                first_instance: instance_index,
+            }
+        }
         GpuBufferInfo::NonIndexed => IndirectParameters {
             vertex_or_index_count: mesh.vertex_count,
             instance_count: 0,
-            first_vertex: 0,
+            first_vertex_or_first_index: vertex_buffer_slice.range.start,
             base_vertex_or_first_instance: instance_index,
             first_instance: instance_index,
         },
@@ -2242,6 +2251,7 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
         SRes<RenderMeshInstances>,
         SRes<IndirectParametersBuffer>,
         SRes<PipelineCache>,
+        SRes<MeshAllocator>,
         Option<SRes<PreprocessPipelines>>,
     );
     type ViewQuery = Has<PreprocessBindGroup>;
@@ -2251,7 +2261,14 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
         item: &P,
         has_preprocess_bind_group: ROQueryItem<Self::ViewQuery>,
         _item_query: Option<()>,
-        (meshes, mesh_instances, indirect_parameters_buffer, pipeline_cache, preprocess_pipelines): SystemParamItem<'w, '_, Self::Param>,
+        (
+            meshes,
+            mesh_instances,
+            indirect_parameters_buffer,
+            pipeline_cache,
+            mesh_allocator,
+            preprocess_pipelines,
+        ): SystemParamItem<'w, '_, Self::Param>,
         pass: &mut TrackedRenderPass<'w>,
     ) -> RenderCommandResult {
         // If we're using GPU preprocessing, then we're dependent on that
@@ -2268,13 +2285,17 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
         let meshes = meshes.into_inner();
         let mesh_instances = mesh_instances.into_inner();
         let indirect_parameters_buffer = indirect_parameters_buffer.into_inner();
+        let mesh_allocator = mesh_allocator.into_inner();
 
         let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(item.entity()) else {
             return RenderCommandResult::Failure;
         };
         let Some(gpu_mesh) = meshes.get(mesh_asset_id) else {
             return RenderCommandResult::Failure;
         };
+        let Some(vertex_buffer_slice) = mesh_allocator.mesh_vertex_slice(&mesh_asset_id) else {
+            return RenderCommandResult::Failure;
+        };
 
         // Calculate the indirect offset, and look up the buffer.
         let indirect_parameters = match item.extra_index().as_indirect_parameters_index() {
@@ -2291,21 +2312,31 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
             },
         };
 
-        pass.set_vertex_buffer(0, gpu_mesh.vertex_buffer.slice(..));
+        pass.set_vertex_buffer(0, vertex_buffer_slice.buffer.slice(..));
 
         let batch_range = item.batch_range();
 
         // Draw either directly or indirectly, as appropriate.
         match &gpu_mesh.buffer_info {
             GpuBufferInfo::Indexed {
-                buffer,
                 index_format,
                 count,
             } => {
-                pass.set_index_buffer(buffer.slice(..), 0, *index_format);
+                let Some(index_buffer_slice) = mesh_allocator.mesh_index_slice(&mesh_asset_id)
+                else {
+                    return RenderCommandResult::Failure;
+                };
+
+                pass.set_index_buffer(index_buffer_slice.buffer.slice(..), 0, *index_format);
+
                 match indirect_parameters {
                     None => {
-                        pass.draw_indexed(0..*count, 0, batch_range.clone());
+                        pass.draw_indexed(
+                            index_buffer_slice.range.start
+                                ..(index_buffer_slice.range.start + *count),
+                            vertex_buffer_slice.range.start as i32,
+                            batch_range.clone(),
+                        );
                     }
                     Some((indirect_parameters_offset, indirect_parameters_buffer)) => pass
                         .draw_indexed_indirect(

diff --git a/crates/bevy_render/Cargo.toml b/crates/bevy_render/Cargo.toml
@@ -101,6 +101,7 @@ profiling = { version = "1", features = [
 async-channel = "2.2.0"
 nonmax = "0.5"
 smallvec = { version = "1.11", features = ["const_new"] }
+offset-allocator = "0.2"
 
 [target.'cfg(not(target_arch = "wasm32"))'.dependencies]
 # Omit the `glsl` feature in non-WebAssembly by default.

diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@@ -186,7 +186,7 @@ pub struct IndirectParameters {
     pub instance_count: u32,
 
     /// The index of the first vertex we're to draw.
-    pub first_vertex: u32,
+    pub first_vertex_or_first_index: u32,
 
     /// For `ArrayIndirectParameters`, `first_instance`; for
     /// `ElementIndirectParameters`, `base_vertex`.