From 8669905245e855a0ac2586f3112a1a41f450452a Mon Sep 17 00:00:00 2001
From: Patrick Walton
Date: Thu, 2 May 2024 15:40:15 -0700
Subject: [PATCH 1/4] Pack multiple meshes into vertex and index buffers.

The underlying allocation algorithm is [`offset-allocator`], which is a port of [Sebastian Aaltonen's `OffsetAllocator`]. It's a fast, simple hard real-time allocator in the two-level segregated fit family.

Allocations are divided into two categories: *regular* and *large*. Regular allocations go into one of the shared slabs managed by an allocator. Large allocations get their own individual slabs. Due to platform limitations, on WebGL 2 all vertex buffers are considered large allocations that get their own slabs; however, index buffers can still be packed together. The slab size is 32 MB by default, but the developer can adjust it manually.

The mesh bin key and compare data have been reworked so that the slab IDs are compared first. That way, meshes that use the same vertex and index buffers tend to be drawn together. Note that this only works well for opaque meshes; transparent meshes must be sorted into draw order, so there's less opportunity for grouping.

The purpose of packing meshes together is to reduce the number of times vertex and index buffers have to be re-bound, which is expensive. In the future, we'd like to use *multi-draw*, which allows us to draw multiple meshes with a single drawcall, as long as they're in the same buffers. Thus, this patch paves the way toward multi-draw, and with it a GPU-driven pipeline.

Even without multi-draw, this patch results in significant performance improvements. For me, the command submission time (i.e. GPU time plus driver and `wgpu` overhead) for Bistro goes from 4.07ms to 1.42ms without shadows (2.8x speedup); with shadows it goes from 6.91ms to 2.62ms (2.45x speedup). The number of vertex and index buffer switches in Bistro is reduced from approximately 3,600 to 927, with the vast majority of the remaining switches due to the transparent pass.
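As an illustration of the packing scheme, the sketch below is not code from this patch and is heavily simplified from the real `GpuAllocator`; it only shows how two meshes can be suballocated from one vertex slab using the `offset-allocator` APIs this patch already relies on (`Allocator::new`, `allocate`, `free`, and the public `offset` field of an allocation). The slab and mesh sizes are made up for the example.

```rust
use offset_allocator::Allocator;

fn main() {
    // One allocator manages one slab. Sizes are in elements (e.g. vertices of
    // a fixed stride), not bytes, mirroring the per-class slabs in this patch.
    let mut slab = Allocator::new(32 * 1024);

    // Suballocate two meshes from the same slab, so both live in one buffer.
    let mesh_a = slab.allocate(3_000).expect("slab has room");
    let mesh_b = slab.allocate(5_000).expect("slab has room");

    // `offset` is the element offset within the slab. A renderer binds the
    // slab's vertex buffer once and passes these offsets as the base vertex
    // (or first index) of each draw instead of rebinding buffers per mesh.
    println!("mesh A starts at element {}", mesh_a.offset);
    println!("mesh B starts at element {}", mesh_b.offset);

    // Freeing an allocation returns its range to the slab for reuse.
    slab.free(mesh_a);
    slab.free(mesh_b);
}
```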
[`offset-allocator`]: https://github.com/pcwalton/offset-allocator/ [Sebastian Aaltonen's `OffsetAllocator`]: https://github.com/sebbbi/OffsetAllocator/ --- crates/bevy_core_pipeline/src/core_3d/mod.rs | 49 +- crates/bevy_core_pipeline/src/prepass/mod.rs | 14 +- crates/bevy_pbr/src/material.rs | 8 +- crates/bevy_pbr/src/prepass/mod.rs | 4 + crates/bevy_pbr/src/render/light.rs | 12 +- crates/bevy_pbr/src/render/mesh.rs | 61 +- crates/bevy_render/Cargo.toml | 2 + crates/bevy_render/src/allocator.rs | 563 ++++++++++++++++++ .../src/batching/gpu_preprocessing.rs | 2 +- crates/bevy_render/src/lib.rs | 3 + crates/bevy_render/src/mesh/mesh/mod.rs | 97 ++- crates/bevy_sprite/src/mesh2d/mesh.rs | 33 +- examples/shader/shader_instancing.rs | 33 +- 13 files changed, 810 insertions(+), 71 deletions(-) create mode 100644 crates/bevy_render/src/allocator.rs diff --git a/crates/bevy_core_pipeline/src/core_3d/mod.rs b/crates/bevy_core_pipeline/src/core_3d/mod.rs index e5cd6c049173f..1b91b80d3a206 100644 --- a/crates/bevy_core_pipeline/src/core_3d/mod.rs +++ b/crates/bevy_core_pipeline/src/core_3d/mod.rs @@ -53,7 +53,7 @@ use bevy_math::FloatOrd; use bevy_render::{ camera::{Camera, ExtractedCamera}, extract_component::ExtractComponentPlugin, - mesh::Mesh, + mesh::{Mesh, MeshSlabHash}, prelude::Msaa, render_graph::{EmptyNode, RenderGraphApp, ViewNodeRunner}, render_phase::{ @@ -72,6 +72,8 @@ use bevy_render::{ }; use bevy_utils::{tracing::warn, HashMap}; +use bitflags::bitflags; + use crate::{ core_3d::main_transmissive_pass_3d_node::MainTransmissivePass3dNode, deferred::{ @@ -191,22 +193,57 @@ pub struct Opaque3d { /// Data that must be identical in order to batch meshes together. #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Opaque3dBinKey { + /// Various flags, with the [`MeshSlabHash`] in the top bits. + /// + /// We want this to be first to minimize IBO and VBO changes. See the + /// comments in [`MeshSlabHash`] for more details. + pub flags: MeshCompareFlags, + /// The identifier of the render pipeline. pub pipeline: CachedRenderPipelineId, /// The function used to draw. pub draw_function: DrawFunctionId, - /// The mesh. - pub asset_id: AssetId, - /// The ID of a bind group specific to the material. /// /// In the case of PBR, this is the `MaterialBindGroupId`. pub material_bind_group_id: Option, - /// The lightmap, if present. - pub lightmap_image: Option>, + /// The lightmap, if present; if not present, this is `AssetId::default()`. + pub lightmap_image: AssetId, + + /// The mesh. + /// + /// Although we don't have multidraw capability yet, we place this at the + /// end to maximize multidraw opportunities in the future. + pub asset_id: AssetId, +} + +bitflags! { + /// Flags that are used as part of the decision to batch or not batch a + /// mesh. + /// + /// This 8-bit flag field is concatenated with [`MeshSlabHash`] to form the + /// `flags` field of a bin key or compare data. + #[derive(Clone, Copy, PartialEq, PartialOrd, Eq, Ord, Hash)] + pub struct MeshCompareFlags: u32 { + /// A lightmap is present. + const HAS_LIGHTMAP = 0x0000_0001; + } +} + +impl MeshCompareFlags { + /// Creates a new [`MeshCompareFlags`] for a mesh and corresponding slab + /// hash. + /// + /// `has_lightmap` should be true if the mesh has a lightmap. 
+ pub fn new(has_lightmap: bool, slab_hash: MeshSlabHash) -> MeshCompareFlags { + let mut flags = MeshCompareFlags::empty(); + flags.set(MeshCompareFlags::HAS_LIGHTMAP, has_lightmap); + flags.insert(MeshCompareFlags::from_bits_retain(*slab_hash)); + flags + } } impl PhaseItem for Opaque3d { diff --git a/crates/bevy_core_pipeline/src/prepass/mod.rs b/crates/bevy_core_pipeline/src/prepass/mod.rs index 88a6ac3970539..052dcf3388245 100644 --- a/crates/bevy_core_pipeline/src/prepass/mod.rs +++ b/crates/bevy_core_pipeline/src/prepass/mod.rs @@ -33,7 +33,7 @@ use bevy_asset::AssetId; use bevy_ecs::prelude::*; use bevy_reflect::Reflect; use bevy_render::{ - mesh::Mesh, + mesh::{Mesh, MeshSlabHash}, render_phase::{ BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, PhaseItem, PhaseItemExtraIndex, @@ -128,19 +128,25 @@ pub struct Opaque3dPrepass { /// The data used to bin each opaque 3D mesh in the prepass and deferred pass. #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct OpaqueNoLightmap3dBinKey { + /// Various flags, with the [`MeshSlabHash`] in the top bits. + /// + /// We want this to be first to minimize IBO and VBO changes. See the + /// comments in [`MeshSlabHash`] for more details. + pub slab_hash: MeshSlabHash, + /// The ID of the GPU pipeline. pub pipeline: CachedRenderPipelineId, /// The function used to draw the mesh. pub draw_function: DrawFunctionId, - /// The ID of the mesh. - pub asset_id: AssetId, - /// The ID of a bind group specific to the material. /// /// In the case of PBR, this is the `MaterialBindGroupId`. pub material_bind_group_id: Option, + + /// The ID of the mesh. + pub asset_id: AssetId, } impl PhaseItem for Opaque3dPrepass { diff --git a/crates/bevy_pbr/src/material.rs b/crates/bevy_pbr/src/material.rs index ede7b1672a9a0..0ecf6b88a66d7 100644 --- a/crates/bevy_pbr/src/material.rs +++ b/crates/bevy_pbr/src/material.rs @@ -7,8 +7,8 @@ use crate::*; use bevy_asset::{Asset, AssetId, AssetServer}; use bevy_core_pipeline::{ core_3d::{ - AlphaMask3d, Camera3d, Opaque3d, Opaque3dBinKey, ScreenSpaceTransmissionQuality, - Transmissive3d, Transparent3d, + AlphaMask3d, Camera3d, MeshCompareFlags, Opaque3d, Opaque3dBinKey, + ScreenSpaceTransmissionQuality, Transmissive3d, Transparent3d, }, prepass::{ DeferredPrepass, DepthPrepass, MotionVectorPrepass, NormalPrepass, OpaqueNoLightmap3dBinKey, @@ -723,7 +723,8 @@ pub fn queue_material_meshes( pipeline: pipeline_id, asset_id: mesh_instance.mesh_asset_id, material_bind_group_id: material.get_bind_group_id().0, - lightmap_image, + lightmap_image: lightmap_image.unwrap_or_default(), + flags: MeshCompareFlags::new(lightmap_image.is_some(), mesh.slab_hash), }; opaque_phase.add(bin_key, *visible_entity, mesh_instance.should_batch()); } @@ -746,6 +747,7 @@ pub fn queue_material_meshes( draw_function: draw_alpha_mask_pbr, pipeline: pipeline_id, asset_id: mesh_instance.mesh_asset_id, + slab_hash: mesh.slab_hash, material_bind_group_id: material.get_bind_group_id().0, }; alpha_mask_phase.add( diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index f4e6b59c74bac..adef90a6263c9 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -860,6 +860,7 @@ pub fn queue_prepass_material_meshes( pipeline: pipeline_id, asset_id: mesh_instance.mesh_asset_id, material_bind_group_id: material.get_bind_group_id().0, + slab_hash: mesh.slab_hash, }, *visible_entity, mesh_instance.should_batch(), @@ -871,6 +872,7 @@ pub fn queue_prepass_material_meshes( 
pipeline: pipeline_id, asset_id: mesh_instance.mesh_asset_id, material_bind_group_id: material.get_bind_group_id().0, + slab_hash: mesh.slab_hash, }, *visible_entity, mesh_instance.should_batch(), @@ -885,6 +887,7 @@ pub fn queue_prepass_material_meshes( draw_function: alpha_mask_draw_deferred, asset_id: mesh_instance.mesh_asset_id, material_bind_group_id: material.get_bind_group_id().0, + slab_hash: mesh.slab_hash, }; alpha_mask_deferred_phase.as_mut().unwrap().add( bin_key, @@ -897,6 +900,7 @@ pub fn queue_prepass_material_meshes( draw_function: alpha_mask_draw_prepass, asset_id: mesh_instance.mesh_asset_id, material_bind_group_id: material.get_bind_group_id().0, + slab_hash: mesh.slab_hash, }; alpha_mask_phase.add( bin_key, diff --git a/crates/bevy_pbr/src/render/light.rs b/crates/bevy_pbr/src/render/light.rs index 74e340c01a33e..1f7c3e1e37e86 100644 --- a/crates/bevy_pbr/src/render/light.rs +++ b/crates/bevy_pbr/src/render/light.rs @@ -3,7 +3,7 @@ use bevy_core_pipeline::core_3d::{Transparent3d, CORE_3D_DEPTH_FORMAT}; use bevy_ecs::prelude::*; use bevy_ecs::{entity::EntityHashMap, system::lifetimeless::Read}; use bevy_math::{Mat4, UVec3, UVec4, Vec2, Vec3, Vec3Swizzles, Vec4, Vec4Swizzles}; -use bevy_render::mesh::Mesh; +use bevy_render::mesh::{Mesh, MeshSlabHash}; use bevy_render::{ camera::Camera, diagnostic::RecordDiagnostics, @@ -1719,6 +1719,7 @@ pub fn queue_shadows( ShadowBinKey { draw_function: draw_shadow_mesh, pipeline: pipeline_id, + slab_hash: mesh.slab_hash, asset_id: mesh_instance.mesh_asset_id, }, entity, @@ -1736,8 +1737,15 @@ pub struct Shadow { pub extra_index: PhaseItemExtraIndex, } +/// The data used to bin each mesh in the shadow map pass. #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct ShadowBinKey { + /// Various flags, with the [`MeshSlabHash`] in the top bits. + /// + /// We want this to be first to minimize IBO and VBO changes. See the + /// comments in [`MeshSlabHash`] for more details. + pub slab_hash: MeshSlabHash, + /// The identifier of the render pipeline. pub pipeline: CachedRenderPipelineId, @@ -1745,6 +1753,8 @@ pub struct ShadowBinKey { pub draw_function: DrawFunctionId, /// The mesh. + /// + /// This is at the end to minimize binding changes. pub asset_id: AssetId, } diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 9433ac46c19be..ca3aa5594ede1 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -14,6 +14,7 @@ use bevy_ecs::{ }; use bevy_math::{Affine3, Rect, UVec2, Vec3, Vec4}; use bevy_render::{ + allocator::GpuAllocator, batching::{ gpu_preprocessing::{ self, GpuPreprocessingSupport, IndirectParameters, IndirectParametersBuffer, @@ -1298,23 +1299,30 @@ fn get_batch_indirect_parameters_index( let mesh_instance = mesh_instances.get(&entity)?; let mesh = meshes.get(mesh_instance.mesh_asset_id)?; + let vertex_offset = mesh.vertex_buffer.offset(); + // Note that `IndirectParameters` covers both of these structures, even // though they actually have distinct layouts. See the comment above that // type for more information. let indirect_parameters = match mesh.buffer_info { GpuBufferInfo::Indexed { - count: index_count, .. - } => IndirectParameters { - vertex_or_index_count: index_count, - instance_count: 0, - first_vertex: 0, - base_vertex_or_first_instance: 0, - first_instance: instance_index, - }, + ref allocation, + count: index_count, + .. 
+ } => { + let index_offset = allocation.offset(); + IndirectParameters { + vertex_or_index_count: index_count, + instance_count: 0, + first_vertex_or_index: index_offset, + base_vertex_or_first_instance: vertex_offset, + first_instance: instance_index, + } + } GpuBufferInfo::NonIndexed => IndirectParameters { vertex_or_index_count: mesh.vertex_count, instance_count: 0, - first_vertex: 0, + first_vertex_or_index: vertex_offset, base_vertex_or_first_instance: instance_index, first_instance: instance_index, }, @@ -2043,6 +2051,7 @@ impl RenderCommand

for DrawMesh { SRes, SRes, SRes, + SRes, Option>, ); type ViewQuery = Has; @@ -2052,7 +2061,14 @@ impl RenderCommand

for DrawMesh { item: &P, has_preprocess_bind_group: ROQueryItem, _item_query: Option<()>, - (meshes, mesh_instances, indirect_parameters_buffer, pipeline_cache, preprocess_pipelines): SystemParamItem<'w, '_, Self::Param>, + ( + meshes, + mesh_instances, + indirect_parameters_buffer, + pipeline_cache, + allocator, + preprocess_pipelines, + ): SystemParamItem<'w, '_, Self::Param>, pass: &mut TrackedRenderPass<'w>, ) -> RenderCommandResult { // If we're using GPU preprocessing, then we're dependent on that @@ -2069,6 +2085,7 @@ impl RenderCommand

for DrawMesh { let meshes = meshes.into_inner(); let mesh_instances = mesh_instances.into_inner(); let indirect_parameters_buffer = indirect_parameters_buffer.into_inner(); + let allocator = allocator.into_inner(); let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(item.entity()) else { return RenderCommandResult::Failure; @@ -2092,21 +2109,32 @@ impl RenderCommand

for DrawMesh { }, }; - pass.set_vertex_buffer(0, gpu_mesh.vertex_buffer.slice(..)); + let vertex_buffer = allocator.buffer(&gpu_mesh.vertex_buffer); + let vertex_offset = gpu_mesh.vertex_buffer.offset(); + + pass.set_vertex_buffer(0, vertex_buffer.slice(..)); let batch_range = item.batch_range(); // Draw either directly or indirectly, as appropriate. match &gpu_mesh.buffer_info { GpuBufferInfo::Indexed { - buffer, + allocation, index_format, count, } => { - pass.set_index_buffer(buffer.slice(..), 0, *index_format); + let index_buffer = allocator.buffer(allocation); + let index_offset = allocation.offset(); + + pass.set_index_buffer(index_buffer.slice(..), 0, *index_format); + match indirect_parameters { None => { - pass.draw_indexed(0..*count, 0, batch_range.clone()); + pass.draw_indexed( + index_offset..(index_offset + *count), + vertex_offset as i32, + batch_range.clone(), + ); } Some((indirect_parameters_offset, indirect_parameters_buffer)) => pass .draw_indexed_indirect( @@ -2117,7 +2145,10 @@ impl RenderCommand

for DrawMesh { } GpuBufferInfo::NonIndexed => match indirect_parameters { None => { - pass.draw(0..gpu_mesh.vertex_count, batch_range.clone()); + pass.draw( + vertex_offset..(vertex_offset + gpu_mesh.vertex_count), + batch_range.clone(), + ); } Some((indirect_parameters_offset, indirect_parameters_buffer)) => { pass.draw_indirect(indirect_parameters_buffer, indirect_parameters_offset); diff --git a/crates/bevy_render/Cargo.toml b/crates/bevy_render/Cargo.toml index 82d16c933733a..3939ef75c3cf1 100644 --- a/crates/bevy_render/Cargo.toml +++ b/crates/bevy_render/Cargo.toml @@ -101,6 +101,8 @@ profiling = { version = "1", features = [ async-channel = "2.2.0" nonmax = "0.5" smallvec = "1.11" +offset-allocator = "0.1" +slotmap = "1" [target.'cfg(not(target_arch = "wasm32"))'.dependencies] # Omit the `glsl` feature in non-WebAssembly by default. diff --git a/crates/bevy_render/src/allocator.rs b/crates/bevy_render/src/allocator.rs new file mode 100644 index 0000000000000..b7642fd405ea5 --- /dev/null +++ b/crates/bevy_render/src/allocator.rs @@ -0,0 +1,563 @@ +//! An allocator that divides up vertex and index buffers so that multiple +//! meshes can be packed together. +//! +//! The underlying allocation algorithm is [`offset-allocator`], which is a port +//! of [Sebastian Aaltonen's `OffsetAllocator`]. It's a fast, simple hard real +//! time allocator in the two-level segregated fit family. +//! +//! Allocations are divided into two categories: *regular* and *large*. Regular +//! allocations go into one of the shared slabs managed by an allocator. Large +//! allocations get their own individual slabs. Due to platform limitations, on +//! WebGL 2 all vertex buffers are considered large allocations that get their +//! own slabs. +//! +//! The purpose of packing meshes together is to reduce the number of times +//! vertex and index buffers have to be re-bound, which is expensive. +//! +//! [`offset-allocator`]: https://github.com/pcwalton/offset-allocator/ +//! [Sebastian Aaltonen's `OffsetAllocator`]: https://github.com/sebbbi/OffsetAllocator + +use std::{ + fmt::{self, Debug, Display, Formatter}, + hash::{DefaultHasher, Hash, Hasher}, + iter, + sync::{Arc, RwLock}, +}; + +use bevy_app::{App, Plugin}; +use bevy_derive::{Deref, DerefMut}; +use bevy_ecs::{ + schedule::IntoSystemConfigs, + system::{ResMut, Resource}, +}; +use bevy_time::common_conditions::on_timer; +use bevy_utils::{hashbrown::HashMap, prelude::default, tracing::error, Duration}; +use offset_allocator::{Allocation, Allocator}; +use slotmap::{new_key_type, SlotMap}; +use wgpu::{ + util::BufferInitDescriptor, BufferDescriptor, BufferUsages, DownlevelFlags, IndexFormat, +}; + +use crate::{ + mesh::MeshVertexBufferLayoutRef, + render_resource::Buffer, + renderer::{RenderAdapter, RenderDevice, RenderQueue}, + Render, RenderApp, +}; + +/// How often we sweep unused allocations. +const SWEEP_INTERVAL: Duration = Duration::from_secs(10); + +/// The default size of a slab, in bytes. +const DEFAULT_SLAB_SIZE: u64 = 32 * 1024 * 1024; + +/// A plugin that provides the GPU memory allocator. +pub struct GpuAllocatorPlugin { + /// The size of a slab. + /// + /// By default, this is 32 MB. + pub slab_size: u64, +} + +/// Manages allocations in GPU buffers. 
+#[derive(Resource, Clone)] +pub struct GpuAllocator { + slabs: HashMap, + slab_size: u64, + next_slab_id: SlabId, + classes: HashMap, + adapter_downlevel_flags: DownlevelFlags, +} + +#[derive(Clone, Default, Deref, DerefMut)] +struct GpuClassAllocator(Arc>); + +#[derive(Default)] +struct GpuClassAllocatorData { + regular_slabs: SlotMap, + large_slabs: SlotMap, + free_large_slabs: Vec, +} + +/// The type of a GPU buffer. Each class has its own allocator. +/// +/// Only allocations of the same class can coexist in a single buffer. +/// +/// Unlike regular CPU memory, GPU buffers require allocation with specific +/// *usages*, which restrict how they can be used. Additionally, the APIs place +/// additional restrictions: for example, because drawcalls require us to +/// specify the initial vertex *index*, and not the initial vertex *byte +/// position*, we must only group meshes with identical vertex buffer layouts +/// into the same buffer. +#[derive(Clone, PartialEq, Eq, Hash, Debug)] +pub enum GpuAllocationClass { + /// A buffer for holding mesh vertex data conforming to the given layout. + VertexBuffer(MeshVertexBufferLayoutRef), + /// A buffer for holding mesh index data, with the given data type. + IndexBuffer(IndexFormat), +} + +/// Identifies a single buffer. +/// +/// We don't use a [`SlotMap`] for these because we want +/// monotonically-increasing integers to achieve a consistent distribution range +/// in the [`crate::mesh::MeshSlabHash`]. If we used a [`SlotMap`], then we +/// could have unpredictable collisions, resulting in harder-to-diagnose +/// performance issues. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Default, Deref, DerefMut, Debug)] +#[repr(transparent)] +pub struct SlabId(pub u32); + +new_key_type! { + /// The index of a regular slab in the + /// `GpuClassAllocatorData::regular_slabs` index. + /// + /// Note that this is distinct from a [`SlabId`]. + pub struct RegularSlabId; +} + +new_key_type! { + /// The index of a large slab in the `GpuClassAllocatorData::large_slabs` + /// index. + /// + /// Note that this is distinct from a [`SlabId`]. + pub struct LargeSlabId; +} + +/// A handle to an allocation. +/// +/// This information can be used to look up the buffer and offset. +/// +/// When this handle is dropped, the allocation is automatically freed. This +/// type isn't clonable; if you want to hand out multiple references, wrap it in +/// an [`Arc`]. +pub struct GpuAllocation { + /// The ID of the allocation in the allocation tables. + allocation_id: GpuAllocationId, + + /// The ID of the buffer in which this allocation lives. + /// + /// This could be fetched from the allocator, but caching it here is faster. + slab_id: SlabId, + + /// This is the offset in `unit_size` elements. It may differ from the + /// offset in `Allocation` because the one in `Allocation` is in multiples + /// of `aligned_unit_size`, while this one is in multiples of `unit_size`. + offset: u32, + + /// A handle to the allocation class that this comes from. + class_allocator: GpuClassAllocator, +} + +/// Identifies an allocation in the allocation tables. +#[derive(Clone)] +enum GpuAllocationId { + /// This allocation is potentially grouped with others as part of a slab. + Regular(RegularSlabId, Allocation), + /// This allocation has its own slab. 
+ Large(LargeSlabId), +} + +impl Plugin for GpuAllocatorPlugin { + fn build(&self, app: &mut App) { + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + render_app.add_systems(Render, free_unused_slabs.run_if(on_timer(SWEEP_INTERVAL))); + } + + fn finish(&self, app: &mut App) { + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + let render_adapter = render_app.world().resource::(); + let adapter_downlevel_flags = render_adapter.get_downlevel_capabilities().flags; + + render_app.insert_resource(GpuAllocator { + slabs: HashMap::default(), + slab_size: self.slab_size, + next_slab_id: SlabId::default(), + classes: HashMap::default(), + adapter_downlevel_flags, + }); + } +} + +impl Default for GpuAllocatorPlugin { + fn default() -> Self { + Self { + slab_size: DEFAULT_SLAB_SIZE, + } + } +} + +impl Drop for GpuAllocation { + fn drop(&mut self) { + // This should never happen, but if it does, we're in a destructor, so + // let's not abort the process. + let Ok(mut class_allocator) = self.class_allocator.write() else { + error!("Couldn't lock the class allocator; just leaking"); + return; + }; + + // Free the allocation. + match self.allocation_id { + GpuAllocationId::Regular(regular_slab_id, allocation) => { + // Find the slab that this allocation came from. + let Some((ref mut allocator, _)) = + class_allocator.regular_slabs.get_mut(regular_slab_id) + else { + error!( + "Couldn't find the slab that this allocation came from; just leaking. \ + (Is the allocator corrupt?)" + ); + return; + }; + + // Tell the allocator to mark this allocation as free. + allocator.free(allocation); + } + + GpuAllocationId::Large(large_slab_id) => { + // Find the slab that this allocation came from. + let Some(slab) = class_allocator.large_slabs.remove(large_slab_id) else { + error!( + "Couldn't find the slab that this allocation came from; just leaking. \ + (Is the allocator corrupt?)" + ); + return; + }; + + // Mark the slab as free. + class_allocator.free_large_slabs.push(slab); + } + } + } +} + +impl GpuAllocationClass { + /// Returns the number of bytes of storage that a single element of this + /// class uses. + fn unit_size(&self) -> u32 { + match *self { + GpuAllocationClass::VertexBuffer(ref layout) => layout.0.layout().array_stride as u32, + GpuAllocationClass::IndexBuffer(IndexFormat::Uint16) => 2, + GpuAllocationClass::IndexBuffer(IndexFormat::Uint32) => 4, + } + } + + /// Returns the number of bytes of storage that a single element of this + /// class uses, rounded up to the nearest 4 bytes. + /// + /// This exists because copies in `wgpu` must begin and end on 4-byte + /// boundaries, so we have to pad out allocations appropriately. + fn aligned_unit_size(&self) -> u32 { + let mut unit_size = self.unit_size(); + if unit_size % 4 != 0 { + unit_size += 4 - unit_size % 4; + } + unit_size + } +} + +impl GpuAllocator { + /// Returns the slab that the given allocation is stored in. + pub fn buffer(&self, allocation: &GpuAllocation) -> &Buffer { + &self.slabs[&allocation.slab_id] + } +} + +impl GpuAllocation { + /// Returns the location within the slab of the allocation, *in elements, + /// not in bytes*. + pub fn offset(&self) -> u32 { + self.offset + } + + /// Returns the ID of the slab that this allocation is stored in. + pub fn slab_id(&self) -> SlabId { + self.slab_id + } +} + +impl GpuAllocator { + /// Allocates memory of the given [`GpuAllocationClass`], and copies data + /// into it. 
+ /// + /// New slabs are automatically allocated, so this method can't fail. + pub fn allocate_with( + &mut self, + render_device: &RenderDevice, + render_queue: &RenderQueue, + class: &GpuAllocationClass, + contents: &[u8], + ) -> GpuAllocation { + // If this is going to overflow a slab, give it + if self.class_requires_large_allocation(class) || (contents.len() as u64) > self.slab_size { + return self.allocate_large_with(render_device, class, contents); + } + + let mut found_allocation = None; + + { + // Create the class allocator if we need to. + let class_allocator = self.classes.entry(class.clone()).or_insert_with(default); + let mut class_allocator_data = class_allocator + .write() + .expect("Failed to lock the class allocator for writing"); + + // Align up to the nearest 4 bytes so we can copy in. + let (unit_size, aligned_unit_size) = (class.unit_size(), class.aligned_unit_size()); + let aligned_contents_size = contents.len().div_ceil(aligned_unit_size as usize) as u32; + + // Try to allocate in one of our existing slabs with a simple first-fit + // algorithm. + for (regular_slab_id, (ref mut allocator, slab_id)) in + class_allocator_data.regular_slabs.iter_mut() + { + if let Some(allocation) = allocator.allocate(aligned_contents_size) { + found_allocation = Some(GpuAllocation { + allocation_id: GpuAllocationId::Regular(regular_slab_id, allocation), + offset: (allocation.offset as u64 * aligned_unit_size as u64 + / unit_size as u64) as u32, + slab_id: *slab_id, + class_allocator: class_allocator.clone(), + }); + break; + } + } + } + + // If we couldn't allocate in any of our existing slabs, create a new + // one. + let allocation = found_allocation + .unwrap_or_else(|| self.allocate_new_regular_slab(render_device, class, contents)); + + // Copy data in. Pad out data to be a multiple of 4 bytes in size if + // necessary. (It's unfortunate that we incur a copy in that case…) + let buffer = &self.slabs[&allocation.slab_id]; + let byte_offset = allocation.offset() as u64 * class.unit_size() as u64; + if contents.len() % 4 == 0 { + render_queue.write_buffer(buffer, byte_offset, contents); + } else { + let contents = contents + .iter() + .copied() + .chain(iter::repeat(0).take(4 - (contents.len() % 4))) + .collect::>(); + + render_queue.write_buffer(buffer, byte_offset, &contents); + }; + + allocation + } + + /// Allocates memory of the given [`GpuAllocationClass`], giving it its own + /// slab. + /// + /// This is used for allocations that overflow the maximum size of a single + /// slab, or for allocations that can't be allocated together due to platform limitations. + fn allocate_large_with( + &mut self, + render_device: &RenderDevice, + class: &GpuAllocationClass, + contents: &[u8], + ) -> GpuAllocation { + // Create a class if we need to. + let class_allocator = self.classes.entry(class.clone()).or_insert_with(default); + let mut class_allocator_data = class_allocator + .write() + .expect("Failed to lock the class allocator for writing"); + + // Try to see if we can reuse an existing large slab. + let mut slab_id = None; + for slab_index in 0..class_allocator_data.free_large_slabs.len() { + if self.slabs[&class_allocator_data.free_large_slabs[slab_index]].size() + >= contents.len() as u64 * 4 + { + slab_id = Some( + class_allocator_data + .free_large_slabs + .swap_remove(slab_index), + ); + break; + } + } + + // If we couldn't, create a new slab. 
+ let slab_id = slab_id.unwrap_or_else(|| { + let slab_id = self.next_slab_id; + *self.next_slab_id += 1; + self.slabs.insert( + slab_id, + render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some(&format!("large slab ({})", class)), + contents, + usage: class.buffer_usage(), + }), + ); + slab_id + }); + + // Create a large slab ID so we can track this allocation, and return. + let large_slab_id = class_allocator_data.large_slabs.insert(slab_id); + + GpuAllocation { + allocation_id: GpuAllocationId::Large(large_slab_id), + offset: 0, + slab_id, + class_allocator: class_allocator.clone(), + } + } + + /// Creates a new regular slab containing a single allocation and copies data into it. + fn allocate_new_regular_slab( + &mut self, + render_device: &RenderDevice, + class: &GpuAllocationClass, + contents: &[u8], + ) -> GpuAllocation { + // Look up the per-class allocator. + let class_allocator = &self.classes[class]; + let mut class_allocator_data = class_allocator + .write() + .expect("Failed to lock the class allocator for writing"); + + let (unit_size, aligned_unit_size) = (class.unit_size(), class.aligned_unit_size()); + let aligned_contents_size = contents.len().div_ceil(aligned_unit_size as usize) as u32; + + // Create the buffer. We the buffer to have `COPY_DST` so we can copy + // data into it. + let buffer = render_device.create_buffer(&BufferDescriptor { + label: Some(&format!("regular slab ({})", class)), + size: self.slab_size, + usage: class.buffer_usage() | BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + + // Create a new slab ID. + let slab_id = self.next_slab_id; + *self.next_slab_id += 1; + self.slabs.insert(slab_id, buffer); + + // Create the allocator. + let mut allocator = Allocator::new((self.slab_size / aligned_unit_size as u64) as u32); + + // Perform the initial allocation. + let allocation = allocator + .allocate(aligned_contents_size) + .expect("Initial allocation should never fail"); + let regular_slab_id = class_allocator_data + .regular_slabs + .insert((allocator, slab_id)); + + GpuAllocation { + allocation_id: GpuAllocationId::Regular(regular_slab_id, allocation), + offset: (allocation.offset as u64 * aligned_unit_size as u64 / unit_size as u64) as u32, + slab_id, + class_allocator: class_allocator.clone(), + } + } + + /// Returns true if the given allocation class requires its own slab due to platform limitations. + fn class_requires_large_allocation(&self, class: &GpuAllocationClass) -> bool { + match *class { + GpuAllocationClass::IndexBuffer(_) => false, + GpuAllocationClass::VertexBuffer(_) => !self + .adapter_downlevel_flags + .contains(DownlevelFlags::BASE_VERTEX), + } + } +} + +impl GpuClassAllocatorData { + /// Returns true if all slabs are empty. + fn is_empty(&self) -> bool { + self.regular_slabs.is_empty() && self.large_slabs.is_empty() + } +} + +/// A system that runs every [`SWEEP_INTERVAL`] seconds and returns unused slab +/// memory to the GPU. +fn free_unused_slabs(allocator: ResMut) { + let allocator = allocator.into_inner(); + let slab_size = allocator.slab_size; + + // Gather up a list of slabs to delete. We'll delete them all at once after + // this. + let mut slabs_to_free = vec![]; + allocator.classes.retain(|_, class| { + let Ok(mut class) = class.write() else { + return true; + }; + + // Free regular slabs. + class.regular_slabs.retain(|_, (allocator, slab_id)| { + // The slab is free if it contains maximal free space. 
+ let free = allocator.storage_report().total_free_space as u64 == slab_size; + if free { + slabs_to_free.push(*slab_id); + } + !free + }); + + // Free large slabs. + slabs_to_free.append(&mut class.free_large_slabs); + + // If the class is now entirely empty, delete it. + !class.is_empty() + }); + + for slab in slabs_to_free { + if let Some(buffer) = allocator.slabs.remove(&slab) { + buffer.destroy(); + } + } +} + +impl Display for GpuAllocationClass { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match *self { + GpuAllocationClass::VertexBuffer(ref layout) => { + let mut hasher = DefaultHasher::new(); + layout.0.hash(&mut hasher); + let hash = hasher.finish(); + write!(f, "vertex buffer ({:16x})", hash) + } + GpuAllocationClass::IndexBuffer(IndexFormat::Uint16) => { + f.write_str("index buffer (u16)") + } + GpuAllocationClass::IndexBuffer(IndexFormat::Uint32) => { + f.write_str("index buffer (u32)") + } + } + } +} + +impl GpuAllocationClass { + /// Returns the `wgpu` [`BufferUsages`] that slabs storing allocations of + /// this type must have. + fn buffer_usage(&self) -> BufferUsages { + match *self { + GpuAllocationClass::VertexBuffer(_) => BufferUsages::VERTEX, + GpuAllocationClass::IndexBuffer(_) => BufferUsages::INDEX, + } + } +} + +impl Debug for GpuAllocation { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{:?} @ {:?}", self.allocation_id, self.slab_id) + } +} + +impl Debug for GpuAllocationId { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Self::Regular(slab, allocation) => write!(f, "R({:?}, {})", slab, allocation.offset), + Self::Large(slab) => write!(f, "L({:?})", slab), + } + } +} diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index bcbcf8b973d48..680b0ebb2082e 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -183,7 +183,7 @@ pub struct IndirectParameters { pub instance_count: u32, /// The index of the first vertex we're to draw. - pub first_vertex: u32, + pub first_vertex_or_index: u32, /// For `ArrayIndirectParameters`, `first_instance`; for /// `ElementIndirectParameters`, `base_vertex`. 
diff --git a/crates/bevy_render/src/lib.rs b/crates/bevy_render/src/lib.rs index 60d8ef648aa77..e84254de3d47c 100644 --- a/crates/bevy_render/src/lib.rs +++ b/crates/bevy_render/src/lib.rs @@ -12,6 +12,7 @@ compile_error!("bevy_render cannot compile for a 16-bit platform."); extern crate core; +pub mod allocator; pub mod alpha; pub mod batching; pub mod camera; @@ -52,6 +53,7 @@ pub mod prelude { }; } +use allocator::GpuAllocatorPlugin; use batching::gpu_preprocessing::BatchingPlugin; use bevy_ecs::schedule::ScheduleBuildSettings; use bevy_utils::prelude::default; @@ -336,6 +338,7 @@ impl Plugin for RenderPlugin { GlobalsPlugin, MorphPlugin, BatchingPlugin, + GpuAllocatorPlugin::default(), )); app.init_resource::() diff --git a/crates/bevy_render/src/mesh/mesh/mod.rs b/crates/bevy_render/src/mesh/mesh/mod.rs index 90e23fbcd8849..9c4e7ecbb308c 100644 --- a/crates/bevy_render/src/mesh/mesh/mod.rs +++ b/crates/bevy_render/src/mesh/mesh/mod.rs @@ -5,15 +5,16 @@ use bitflags::bitflags; pub use wgpu::PrimitiveTopology; use crate::{ + allocator::{GpuAllocation, GpuAllocationClass, GpuAllocator}, prelude::Image, primitives::Aabb, render_asset::{PrepareAssetError, RenderAsset, RenderAssetUsages, RenderAssets}, - render_resource::{Buffer, TextureView, VertexBufferLayout}, - renderer::RenderDevice, + render_resource::{TextureView, VertexBufferLayout}, + renderer::{RenderDevice, RenderQueue}, texture::GpuImage, }; use bevy_asset::{Asset, Handle}; -use bevy_derive::EnumVariantMeta; +use bevy_derive::{Deref, DerefMut, EnumVariantMeta}; use bevy_ecs::system::{ lifetimeless::{SRes, SResMut}, SystemParamItem, @@ -22,12 +23,9 @@ use bevy_math::*; use bevy_reflect::Reflect; use bevy_utils::tracing::{error, warn}; use bytemuck::cast_slice; -use std::{collections::BTreeMap, hash::Hash, iter::FusedIterator}; +use std::{collections::BTreeMap, hash::Hash, iter::FusedIterator, sync::Arc}; use thiserror::Error; -use wgpu::{ - util::BufferInitDescriptor, BufferUsages, IndexFormat, VertexAttribute, VertexFormat, - VertexStepMode, -}; +use wgpu::{IndexFormat, VertexAttribute, VertexFormat, VertexStepMode}; use super::{MeshVertexBufferLayoutRef, MeshVertexBufferLayouts}; @@ -1432,17 +1430,51 @@ impl BaseMeshPipelineKey { } } +/// A hash of the slab IDs of the vertex and index buffers. +/// +/// This hash exists to increase the probability that meshes that live in the +/// same index and vertex buffers are rendered together. Avoiding switching +/// vertex and index buffers is a significant performance improvement during +/// rendering. +/// +/// We pack the bottom 12 bits of each slab ID into the high bits of a 24-bit +/// field. The bottom 8 bits are reserved for pipeline-key-specific flags. +#[derive(Clone, Copy, PartialEq, PartialOrd, Eq, Ord, Deref, DerefMut, Hash, Debug)] +#[repr(transparent)] +pub struct MeshSlabHash(pub u32); + +impl MeshSlabHash { + const VBO_SLAB_ID_SHIFT: u32 = 8; + const VBO_SLAB_ID_MASK: u32 = 0xfff << Self::VBO_SLAB_ID_SHIFT; + const IBO_SLAB_ID_SHIFT: u32 = 20; + const IBO_SLAB_ID_MASK: u32 = 0xfff << Self::IBO_SLAB_ID_SHIFT; + + fn new(vertex_buffer: &GpuAllocation, buffer_info: &GpuBufferInfo) -> MeshSlabHash { + // Hash the VBO slab index. + let mut hash = + (*vertex_buffer.slab_id() << Self::VBO_SLAB_ID_SHIFT) & Self::VBO_SLAB_ID_MASK; + + // Hash the IBO slab index. + if let GpuBufferInfo::Indexed { ref allocation, .. 
} = *buffer_info { + hash |= (*allocation.slab_id() << Self::IBO_SLAB_ID_SHIFT) & Self::IBO_SLAB_ID_MASK; + } + + MeshSlabHash(hash) + } +} + /// The GPU-representation of a [`Mesh`]. /// Consists of a vertex data buffer and an optional index data buffer. #[derive(Debug, Clone)] pub struct GpuMesh { /// Contains all attribute data for each vertex. - pub vertex_buffer: Buffer, + pub vertex_buffer: Arc, pub vertex_count: u32, pub morph_targets: Option, pub buffer_info: GpuBufferInfo, pub key_bits: BaseMeshPipelineKey, pub layout: MeshVertexBufferLayoutRef, + pub slab_hash: MeshSlabHash, } impl GpuMesh { @@ -1457,7 +1489,7 @@ impl GpuMesh { pub enum GpuBufferInfo { Indexed { /// Contains all index data of a mesh. - buffer: Buffer, + allocation: Arc, count: u32, index_format: IndexFormat, }, @@ -1468,8 +1500,10 @@ impl RenderAsset for GpuMesh { type SourceAsset = Mesh; type Param = ( SRes, + SRes, SRes>, SResMut, + SResMut, ); #[inline] @@ -1492,9 +1526,13 @@ impl RenderAsset for GpuMesh { /// Converts the extracted mesh a into [`GpuMesh`]. fn prepare_asset( mesh: Self::SourceAsset, - (render_device, images, ref mut mesh_vertex_buffer_layouts): &mut SystemParamItem< - Self::Param, - >, + ( + render_device, + render_queue, + images, + ref mut mesh_vertex_buffer_layouts, + ref mut allocator, + ): &mut SystemParamItem, ) -> Result> { let morph_targets = match mesh.morph_targets.as_ref() { Some(mt) => { @@ -1506,20 +1544,25 @@ impl RenderAsset for GpuMesh { None => None, }; + let mesh_vertex_buffer_layout = + mesh.get_mesh_vertex_buffer_layout(mesh_vertex_buffer_layouts); + let vertex_buffer_data = mesh.get_vertex_buffer_data(); - let vertex_buffer = render_device.create_buffer_with_data(&BufferInitDescriptor { - usage: BufferUsages::VERTEX, - label: Some("Mesh Vertex Buffer"), - contents: &vertex_buffer_data, - }); + let vertex_buffer = allocator.allocate_with( + render_device, + render_queue, + &GpuAllocationClass::VertexBuffer(mesh_vertex_buffer_layout.clone()), + bytemuck::cast_slice(&vertex_buffer_data), + ); let buffer_info = if let Some(data) = mesh.get_index_buffer_bytes() { GpuBufferInfo::Indexed { - buffer: render_device.create_buffer_with_data(&BufferInitDescriptor { - usage: BufferUsages::INDEX, - contents: data, - label: Some("Mesh Index Buffer"), - }), + allocation: Arc::new(allocator.allocate_with( + render_device, + render_queue, + &GpuAllocationClass::IndexBuffer(mesh.indices().unwrap().into()), + bytemuck::cast_slice(data), + )), count: mesh.indices().unwrap().len() as u32, index_format: mesh.indices().unwrap().into(), } @@ -1527,22 +1570,22 @@ impl RenderAsset for GpuMesh { GpuBufferInfo::NonIndexed }; - let mesh_vertex_buffer_layout = - mesh.get_mesh_vertex_buffer_layout(mesh_vertex_buffer_layouts); - let mut key_bits = BaseMeshPipelineKey::from_primitive_topology(mesh.primitive_topology()); key_bits.set( BaseMeshPipelineKey::MORPH_TARGETS, mesh.morph_targets.is_some(), ); + let slab_hash = MeshSlabHash::new(&vertex_buffer, &buffer_info); + Ok(GpuMesh { - vertex_buffer, + vertex_buffer: Arc::new(vertex_buffer), vertex_count: mesh.count_vertices() as u32, buffer_info, key_bits, layout: mesh_vertex_buffer_layout, morph_targets, + slab_hash, }) } } diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index 0dec7f0cbdac5..9e825609b594e 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -11,6 +11,7 @@ use bevy_ecs::{ }; use bevy_math::{Affine3, Vec4}; use bevy_reflect::{std_traits::ReflectDefault, 
Reflect}; +use bevy_render::allocator::GpuAllocator; use bevy_render::batching::no_gpu_preprocessing::{ self, batch_and_prepare_sorted_render_phase, write_batched_instance_buffer, BatchedInstanceBuffer, @@ -661,7 +662,11 @@ impl RenderCommand

for SetMesh2dBindGroup { pub struct DrawMesh2d; impl RenderCommand

for DrawMesh2d { - type Param = (SRes>, SRes); + type Param = ( + SRes>, + SRes, + SRes, + ); type ViewQuery = (); type ItemQuery = (); @@ -670,11 +675,12 @@ impl RenderCommand

for DrawMesh2d { item: &P, _view: (), _item_query: Option<()>, - (meshes, render_mesh2d_instances): SystemParamItem<'w, '_, Self::Param>, + (meshes, render_mesh2d_instances, allocator): SystemParamItem<'w, '_, Self::Param>, pass: &mut TrackedRenderPass<'w>, ) -> RenderCommandResult { let meshes = meshes.into_inner(); let render_mesh2d_instances = render_mesh2d_instances.into_inner(); + let allocator = allocator.into_inner(); let Some(RenderMesh2dInstance { mesh_asset_id, .. }) = render_mesh2d_instances.get(&item.entity()) @@ -685,20 +691,33 @@ impl RenderCommand

for DrawMesh2d { return RenderCommandResult::Failure; }; - pass.set_vertex_buffer(0, gpu_mesh.vertex_buffer.slice(..)); + let vertex_buffer = allocator.buffer(&gpu_mesh.vertex_buffer); + let vertex_offset = gpu_mesh.vertex_buffer.offset(); + + pass.set_vertex_buffer(0, vertex_buffer.slice(..)); let batch_range = item.batch_range(); match &gpu_mesh.buffer_info { GpuBufferInfo::Indexed { - buffer, + ref allocation, index_format, count, } => { - pass.set_index_buffer(buffer.slice(..), 0, *index_format); - pass.draw_indexed(0..*count, 0, batch_range.clone()); + let index_buffer = allocator.buffer(allocation); + let index_offset = allocation.offset(); + + pass.set_index_buffer(index_buffer.slice(..), vertex_offset as u64, *index_format); + pass.draw_indexed( + index_offset..(index_offset + *count), + 0, + batch_range.clone(), + ); } GpuBufferInfo::NonIndexed => { - pass.draw(0..gpu_mesh.vertex_count, batch_range.clone()); + pass.draw( + vertex_offset..(vertex_offset + gpu_mesh.vertex_count), + batch_range.clone(), + ); } } RenderCommandResult::Success diff --git a/examples/shader/shader_instancing.rs b/examples/shader/shader_instancing.rs index 17fd7823c95ad..6d65680ba67a8 100644 --- a/examples/shader/shader_instancing.rs +++ b/examples/shader/shader_instancing.rs @@ -11,6 +11,7 @@ use bevy::{ }, prelude::*, render::{ + allocator::GpuAllocator, extract_component::{ExtractComponent, ExtractComponentPlugin}, mesh::{GpuBufferInfo, GpuMesh, MeshVertexBufferLayoutRef}, render_asset::RenderAssets, @@ -233,7 +234,11 @@ type DrawCustom = ( struct DrawMeshInstanced; impl RenderCommand

for DrawMeshInstanced { - type Param = (SRes>, SRes); + type Param = ( + SRes>, + SRes, + SRes, + ); type ViewQuery = (); type ItemQuery = Read; @@ -242,7 +247,7 @@ impl RenderCommand

for DrawMeshInstanced { item: &P, _view: (), instance_buffer: Option<&'w InstanceBuffer>, - (meshes, render_mesh_instances): SystemParamItem<'w, '_, Self::Param>, + (meshes, render_mesh_instances, allocator): SystemParamItem<'w, '_, Self::Param>, pass: &mut TrackedRenderPass<'w>, ) -> RenderCommandResult { let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(item.entity()) @@ -255,21 +260,35 @@ impl RenderCommand

for DrawMeshInstanced { let Some(instance_buffer) = instance_buffer else { return RenderCommandResult::Failure; }; + let allocator = allocator.into_inner(); - pass.set_vertex_buffer(0, gpu_mesh.vertex_buffer.slice(..)); + let vertex_buffer = allocator.buffer(&gpu_mesh.vertex_buffer); + let vertex_offset = gpu_mesh.vertex_buffer.offset(); + + pass.set_vertex_buffer(0, vertex_buffer.slice(..)); pass.set_vertex_buffer(1, instance_buffer.buffer.slice(..)); match &gpu_mesh.buffer_info { GpuBufferInfo::Indexed { - buffer, + allocation, index_format, count, } => { - pass.set_index_buffer(buffer.slice(..), 0, *index_format); - pass.draw_indexed(0..*count, 0, 0..instance_buffer.length as u32); + let index_buffer = allocator.buffer(allocation); + let index_offset = allocation.offset(); + + pass.set_index_buffer(index_buffer.slice(..), 0, *index_format); + pass.draw_indexed( + index_offset..(index_offset + *count), + vertex_offset as i32, + 0..instance_buffer.length as u32, + ); } GpuBufferInfo::NonIndexed => { - pass.draw(0..gpu_mesh.vertex_count, 0..instance_buffer.length as u32); + pass.draw( + vertex_offset..(vertex_offset + gpu_mesh.vertex_count), + 0..instance_buffer.length as u32, + ); } } RenderCommandResult::Success From 3a20a1e7eaac051d61593ad3b19ccb154f181804 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Thu, 16 May 2024 22:47:12 -0700 Subject: [PATCH 2/4] Take vertex offset into account for 2D meshes --- crates/bevy_sprite/src/mesh2d/mesh.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index 9e825609b594e..b5648eb237e23 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -709,7 +709,7 @@ impl RenderCommand

for DrawMesh2d { pass.set_index_buffer(index_buffer.slice(..), vertex_offset as u64, *index_format); pass.draw_indexed( index_offset..(index_offset + *count), - 0, + vertex_offset as i32, batch_range.clone(), ); } From 9b8180293970fc2992d723f8419d600d6861e5c9 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Thu, 16 May 2024 22:50:10 -0700 Subject: [PATCH 3/4] Address review comment --- crates/bevy_render/src/allocator.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/bevy_render/src/allocator.rs b/crates/bevy_render/src/allocator.rs index b7642fd405ea5..c2c933d68bbe9 100644 --- a/crates/bevy_render/src/allocator.rs +++ b/crates/bevy_render/src/allocator.rs @@ -60,7 +60,7 @@ pub struct GpuAllocatorPlugin { } /// Manages allocations in GPU buffers. -#[derive(Resource, Clone)] +#[derive(Resource)] pub struct GpuAllocator { slabs: HashMap, slab_size: u64, From 9742520af8d0aea7ee7da8c71daa5962ec80a3db Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Mon, 20 May 2024 19:14:02 -0700 Subject: [PATCH 4/4] Address review comments --- crates/bevy_render/src/allocator.rs | 31 +++++++++++++---------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/crates/bevy_render/src/allocator.rs b/crates/bevy_render/src/allocator.rs index c2c933d68bbe9..1944a7bdee0a9 100644 --- a/crates/bevy_render/src/allocator.rs +++ b/crates/bevy_render/src/allocator.rs @@ -254,11 +254,7 @@ impl GpuAllocationClass { /// This exists because copies in `wgpu` must begin and end on 4-byte /// boundaries, so we have to pad out allocations appropriately. fn aligned_unit_size(&self) -> u32 { - let mut unit_size = self.unit_size(); - if unit_size % 4 != 0 { - unit_size += 4 - unit_size % 4; - } - unit_size + self.unit_size().next_multiple_of(4) } } @@ -339,17 +335,18 @@ impl GpuAllocator { // necessary. (It's unfortunate that we incur a copy in that case…) let buffer = &self.slabs[&allocation.slab_id]; let byte_offset = allocation.offset() as u64 * class.unit_size() as u64; - if contents.len() % 4 == 0 { - render_queue.write_buffer(buffer, byte_offset, contents); - } else { - let contents = contents - .iter() - .copied() - .chain(iter::repeat(0).take(4 - (contents.len() % 4))) - .collect::>(); - - render_queue.write_buffer(buffer, byte_offset, &contents); - }; + match contents.len() % 4 { + 0 => render_queue.write_buffer(buffer, byte_offset, contents), + remainder => { + let contents = contents + .iter() + .copied() + .chain(iter::repeat(0).take(4 - remainder)) + .collect::>(); + + render_queue.write_buffer(buffer, byte_offset, &contents); + } + } allocation } @@ -375,7 +372,7 @@ impl GpuAllocator { let mut slab_id = None; for slab_index in 0..class_allocator_data.free_large_slabs.len() { if self.slabs[&class_allocator_data.free_large_slabs[slab_index]].size() - >= contents.len() as u64 * 4 + >= contents.len() as u64 { slab_id = Some( class_allocator_data