From 7906583966c5edb9d80b4f17670070e7946d65d0 Mon Sep 17 00:00:00 2001
From: Prakash Surya
Date: Wed, 1 Dec 2021 09:15:29 -0800
Subject: [PATCH] Fix zettacache allocation cursor on startup (#17)

On startup, the allocation cursor for each of the buckets within the
zettacache appears to be set incorrectly. Rather than the cursor being
set to the slab with the most free space in each bucket, we
inadvertently set the cursor to the last slab loaded from disk for
each bucket.

This means that on startup, if the last slab loaded for a bucket
happens to be full and at the end of the sorted slab list, a new slab
will be converted on the next allocation, rather than satisfying that
allocation from a partially allocated slab already in the bucket.
Further, it means that if that new slab is filled with new allocations
within a single checkpoint, allocation failures will result, even if
the bucket has lots of free space available.

This error is easy to make because SlabAllocationBuckets can be in a
half-constructed state, where its SortedSlabs don't have all the slabs
and their last_allocated is bogus.

The fix implemented here is to no longer allow a half-constructed
SlabAllocationBuckets; instead, its new() method now takes sufficient
parameters to get it into a valid state that accurately represents
what's on disk, before the object is returned to the caller.
---
 .../zettacache/src/block_allocator.rs         | 33 ++++++++++++-------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/cmd/zfs_object_agent/zettacache/src/block_allocator.rs b/cmd/zfs_object_agent/zettacache/src/block_allocator.rs
index dbec298c4e1e..df24ed3510af 100644
--- a/cmd/zfs_object_agent/zettacache/src/block_allocator.rs
+++ b/cmd/zfs_object_agent/zettacache/src/block_allocator.rs
@@ -13,7 +13,7 @@ use std::collections::{BTreeMap, BTreeSet, HashSet};
 use std::ops::Bound::*;
 use std::sync::Arc;
 use std::time::Instant;
-use std::{fmt, iter, mem};
+use std::{fmt, mem};
 use util::BitmapRangeIterator;
 use util::RangeTree;
 use util::{get_tunable, TerseVec};
@@ -770,19 +770,22 @@ impl SortedSlabs {
 struct SlabAllocationBuckets(BTreeMap<u32, SortedSlabs>);
 
 impl SlabAllocationBuckets {
-    fn new(phys: SlabAllocationBucketsPhys) -> Self {
+    fn new(
+        phys: SlabAllocationBucketsPhys,
+        mut slabs: BTreeMap<u32, Vec<SortedSlabEntry>>,
+    ) -> Self {
         let mut buckets = BTreeMap::new();
         for (max_size, is_extent_based) in phys.buckets {
-            buckets.insert(max_size, SortedSlabs::new(is_extent_based, iter::empty()));
+            buckets.insert(
+                max_size,
+                SortedSlabs::new(is_extent_based, slabs.remove(&max_size).unwrap_or_default()),
+            );
         }
-        SlabAllocationBuckets(buckets)
-    }
 
-    fn add_slab_to_bucket(&mut self, bucket: u32, slab: &Slab) {
-        self.0
-            .get_mut(&bucket)
-            .unwrap()
-            .insert(slab.to_sorted_slab_entry());
+        // We expect all slabs passed in to be consumed and added to a bucket.
+        assert!(slabs.is_empty());
+
+        SlabAllocationBuckets(buckets)
     }
 
     fn get_bucket_for_allocation_size(&mut self, request_size: u32) -> (&u32, &mut SortedSlabs) {
@@ -986,11 +989,15 @@ impl BlockAllocator {
 
         let mut available_space = 0u64;
         let mut free_slabs = Vec::new();
-        let mut slab_buckets = SlabAllocationBuckets::new(phys.slab_buckets);
+        let mut slabs_by_bucket: BTreeMap<u32, Vec<SortedSlabEntry>> = BTreeMap::new();
+
         for slab in slabs.0.iter() {
             match &slab.info {
                 SlabType::BitmapBased(_) | SlabType::ExtentBased(_) => {
-                    slab_buckets.add_slab_to_bucket(slab.get_max_size(), slab);
+                    slabs_by_bucket
+                        .entry(slab.get_max_size())
+                        .or_default()
+                        .push(slab.to_sorted_slab_entry());
                     available_space += slab.get_free_space();
                 }
                 SlabType::Free(_) => {
@@ -1000,6 +1007,8 @@
             }
         }
 
+        let slab_buckets = SlabAllocationBuckets::new(phys.slab_buckets, slabs_by_bucket);
+
         info!(
             "loaded BlockAllocator metadata in {}ms",
             begin.elapsed().as_millis()
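
Note (illustration only, not part of the patch): the following is a minimal
standalone sketch of the construct-fully-or-fail pattern this fix adopts.
Bucket, Buckets, and the u64 slab ids here are simplified stand-ins for the
zettacache's SortedSlabs, SlabAllocationBuckets, and SortedSlabEntry; only
the shape of the constructor mirrors the change.

    use std::collections::BTreeMap;

    #[derive(Debug, Default)]
    struct Bucket {
        slabs: Vec<u64>, // slab ids; the real SortedSlabs also tracks a cursor
    }

    #[derive(Debug)]
    struct Buckets(BTreeMap<u32, Bucket>);

    impl Buckets {
        // Takes everything needed to reach a valid state, consumes the
        // per-bucket slab lists, and asserts nothing was left over, so a
        // half-constructed value can never escape to the caller.
        fn new(max_sizes: &[u32], mut slabs: BTreeMap<u32, Vec<u64>>) -> Self {
            let mut buckets = BTreeMap::new();
            for &max_size in max_sizes {
                buckets.insert(
                    max_size,
                    Bucket { slabs: slabs.remove(&max_size).unwrap_or_default() },
                );
            }
            // Every slab must have been claimed by some bucket; otherwise
            // the object would misrepresent what's on disk.
            assert!(slabs.is_empty());
            Buckets(buckets)
        }
    }

    fn main() {
        let mut by_bucket: BTreeMap<u32, Vec<u64>> = BTreeMap::new();
        by_bucket.entry(4096).or_default().push(7); // slab 7 holds <=4K blocks
        let buckets = Buckets::new(&[4096, 8192], by_bucket);
        println!("{:?}", buckets);
    }

Because the only constructor both consumes the slab map and asserts it is
empty afterwards, the "bucket exists but is missing slabs" state that caused
the bogus allocation cursor simply cannot be represented.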