From 5c2284c8dbe837b217ae3bef7003c2eff79262c8 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 2 Oct 2024 13:48:44 -0400 Subject: [PATCH 1/2] feat(pageserver): do space check before gc-compaction Signed-off-by: Alex Chi Z --- Cargo.lock | 11 ++++++ Cargo.toml | 1 + pageserver/Cargo.toml | 1 + pageserver/src/tenant/storage_layer/layer.rs | 4 +++ pageserver/src/tenant/timeline/compaction.rs | 35 ++++++++++++++++++++ 5 files changed, 52 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 7e772814ec57..9248235ef8c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2059,6 +2059,16 @@ dependencies = [ "tokio-util", ] +[[package]] +name = "fs2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "fsevent-sys" version = "4.1.0" @@ -3601,6 +3611,7 @@ dependencies = [ "enum-map", "enumset", "fail", + "fs2", "futures", "hex", "hex-literal", diff --git a/Cargo.toml b/Cargo.toml index a1a974b33b48..de3d1cf09df6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,6 +83,7 @@ enumset = "1.0.12" fail = "0.5.0" fallible-iterator = "0.2" framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" } +fs2 = "0.4" futures = "0.3" futures-core = "0.3" futures-util = "0.3" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 2531abc7a187..713ad9ef9d14 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -26,6 +26,7 @@ consumption_metrics.workspace = true crc32c.workspace = true either.workspace = true fail.workspace = true +fs2.workspace = true futures.workspace = true hex.workspace = true humantime.workspace = true diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index bbb21b180ed5..f29a33bae6de 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -341,6 +341,10 @@ impl Layer { Ok(()) } + pub(crate) async fn needs_download(&self) -> Result, std::io::Error> { + self.0.needs_download().await + } + /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction /// while the guard exists. /// diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8b9ace1e5bbf..174d96668fe8 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1691,6 +1691,39 @@ impl Timeline { unreachable!("key retention is empty") } + /// Check how much space is left on the disk + async fn check_available_space(self: &Arc) -> anyhow::Result { + let base_path = self.conf.tenants_path(); + fs2::free_space(base_path).context("fs2::free_space") + } + + /// Check if the compaction can proceed safely without running out of space. We assume the size + /// upper bound of the produced files of a compaction job is the same as all layers involved in + /// the compaction. Therefore, we need `2 * layers_to_be_compacted_size` at least to do a + /// compaction. + async fn check_compaction_space( + self: &Arc, + layer_selection: &[Layer], + ) -> anyhow::Result<()> { + let available_space = self.check_available_space().await?; + let mut remote_layer_size = 0; + let mut all_layer_size = 0; + for layer in layer_selection { + let needs_download = layer.needs_download().await?; + if needs_download.is_some() { + remote_layer_size += layer.layer_desc().file_size; + } + all_layer_size += layer.layer_desc().file_size; + } + let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ + if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space + { + return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", + available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size)); + } + Ok(()) + } + /// An experimental compaction building block that combines compaction with garbage collection. /// /// The current implementation picks all delta + image layers that are below or intersecting with @@ -1806,6 +1839,8 @@ impl Timeline { lowest_retain_lsn ); + self.check_compaction_space(&layer_selection).await?; + // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) From ccda7d7031f21b2ec3688a98bbbfc2d0e3eb90d3 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 15 Oct 2024 16:49:03 -0400 Subject: [PATCH 2/2] switch to statvfs Signed-off-by: Alex Chi Z --- Cargo.lock | 11 ----------- Cargo.toml | 1 - pageserver/Cargo.toml | 1 - pageserver/src/disk_usage_eviction_task.rs | 11 +---------- pageserver/src/statvfs.rs | 16 ++++++++++++++++ pageserver/src/tenant/timeline/compaction.rs | 11 +++++++++-- 6 files changed, 26 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9248235ef8c5..7e772814ec57 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2059,16 +2059,6 @@ dependencies = [ "tokio-util", ] -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "fsevent-sys" version = "4.1.0" @@ -3611,7 +3601,6 @@ dependencies = [ "enum-map", "enumset", "fail", - "fs2", "futures", "hex", "hex-literal", diff --git a/Cargo.toml b/Cargo.toml index de3d1cf09df6..a1a974b33b48 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,7 +83,6 @@ enumset = "1.0.12" fail = "0.5.0" fallible-iterator = "0.2" framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" } -fs2 = "0.4" futures = "0.3" futures-core = "0.3" futures-util = "0.3" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 713ad9ef9d14..2531abc7a187 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -26,7 +26,6 @@ consumption_metrics.workspace = true crc32c.workspace = true either.workspace = true fail.workspace = true -fs2.workspace = true futures.workspace = true hex.workspace = true humantime.workspace = true diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index a58fa2c0b1e1..7ab2ba87420f 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -1218,16 +1218,7 @@ mod filesystem_level_usage { let stat = Statvfs::get(tenants_dir, mock_config) .context("statvfs failed, presumably directory got unlinked")?; - // https://unix.stackexchange.com/a/703650 - let blocksize = if stat.fragment_size() > 0 { - stat.fragment_size() - } else { - stat.block_size() - }; - - // use blocks_available (b_avail) since, pageserver runs as unprivileged user - let avail_bytes = stat.blocks_available() * blocksize; - let total_bytes = stat.blocks() * blocksize; + let (avail_bytes, total_bytes) = stat.get_avail_total_bytes(); Ok(Usage { config, diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 5a6f6e5176ac..205605bc86a8 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -53,6 +53,22 @@ impl Statvfs { Statvfs::Mock(stat) => stat.block_size, } } + + /// Get the available and total bytes on the filesystem. + pub fn get_avail_total_bytes(&self) -> (u64, u64) { + // https://unix.stackexchange.com/a/703650 + let blocksize = if self.fragment_size() > 0 { + self.fragment_size() + } else { + self.block_size() + }; + + // use blocks_available (b_avail) since, pageserver runs as unprivileged user + let avail_bytes = self.blocks_available() * blocksize; + let total_bytes = self.blocks() * blocksize; + + (avail_bytes, total_bytes) + } } pub mod mock { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 174d96668fe8..5588363330d0 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -29,6 +29,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::filter_iterator::FilterIterator; @@ -1693,8 +1694,14 @@ impl Timeline { /// Check how much space is left on the disk async fn check_available_space(self: &Arc) -> anyhow::Result { - let base_path = self.conf.tenants_path(); - fs2::free_space(base_path).context("fs2::free_space") + let tenants_dir = self.conf.tenants_path(); + + let stat = Statvfs::get(&tenants_dir, None) + .context("statvfs failed, presumably directory got unlinked")?; + + let (avail_bytes, _) = stat.get_avail_total_bytes(); + + Ok(avail_bytes) } /// Check if the compaction can proceed safely without running out of space. We assume the size