From 71288f1524714b8eaf3e1cb03e9a49960311a793 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 11 Nov 2024 12:46:45 +0000 Subject: [PATCH 1/4] pageserver: refuse to load too-old indexes --- libs/remote_storage/src/error.rs | 8 +++++++- pageserver/src/tenant.rs | 6 ++++++ pageserver/src/tenant/remote_timeline_client.rs | 8 +++++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs index 17790e9f70df..1119b72a7489 100644 --- a/libs/remote_storage/src/error.rs +++ b/libs/remote_storage/src/error.rs @@ -1,3 +1,5 @@ +use std::borrow::Cow; + /// Reasons for downloads or listings to fail. #[derive(Debug)] pub enum DownloadError { @@ -15,6 +17,9 @@ pub enum DownloadError { /// /// Concurrency control is not timed within timeout. Timeout, + /// Some integrity/consistency check failed during download. This is used during + /// timeline loads to cancel the load of a tenant if some timeline detects fatal corruption. + Fatal(Cow<'static, str>), /// The file was found in the remote storage, but the download failed. Other(anyhow::Error), } @@ -29,6 +34,7 @@ impl std::fmt::Display for DownloadError { DownloadError::Unmodified => write!(f, "File was not modified"), DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), DownloadError::Timeout => write!(f, "timeout"), + DownloadError::Fatal(why) => write!(f, "Fatal read error: {why}"), DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), } } @@ -41,7 +47,7 @@ impl DownloadError { pub fn is_permanent(&self) -> bool { use DownloadError::*; match self { - BadInput(_) | NotFound | Unmodified | Cancelled => true, + BadInput(_) | NotFound | Unmodified | Fatal(_) | Cancelled => true, Timeout | Other(_) => false, } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 903174680e00..cb079ccbfa4c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1433,6 +1433,12 @@ impl Tenant { info!(%timeline_id, "index_part not found on remote"); continue; } + Err(DownloadError::Fatal(why)) => { + // If, while loading one remote timeline, we saw an indication that our generation + // number is likely invalid, then we should not load the whole tenant. + error!(%timeline_id, "Fatal error loading timeline: {why}"); + anyhow::bail!(why.to_string()); + } Err(e) => { // Some (possibly ephemeral) error happened during index_part download. // Pretend the timeline exists to not delete the timeline directory, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index b37c16e133fd..600583f6b50f 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -574,12 +574,18 @@ impl RemoteTimelineClient { if latest_index_generation > index_generation { // Unexpected! Why are we loading such an old index if a more recent one exists? - tracing::warn!( + // We will refuse to proceed, as there is no reasonable scenario where this should happen, but + // there _is_ a clear bug/corruption scenario where it would happen (controller sets the generation + // backwards). + tracing::error!( ?index_generation, ?latest_index_generation, ?latest_index_mtime, "Found a newer index while loading an old one" ); + return Err(DownloadError::Fatal( + "Index age exceeds threshold and a newer index exists".into(), + )); } } From a94ec281bd69ccafd52c4adcb3220358bb530b2e Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 11 Nov 2024 17:01:53 +0000 Subject: [PATCH 2/4] tests: add test_old_index_time_threshold --- .../regress/test_pageserver_generations.py | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 8f6c9f16fd6f..bc753e76fc06 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -34,7 +34,9 @@ wait_for_last_record_lsn, wait_for_upload, ) +from fixtures.pg_version import run_only_on_default_postgres from fixtures.remote_storage import ( + LocalFsStorage, RemoteStorageKind, ) from fixtures.utils import wait_until @@ -728,3 +730,68 @@ def test_upgrade_generationless_local_file_paths( ) # We should download into the same local path we started with assert os.path.exists(victim_path) + + +@run_only_on_default_postgres("Only tests index logic") +def test_old_index_time_threshold( + neon_env_builder: NeonEnvBuilder, +): + """ + Exercise pageserver's detection of trying to load an ancient non-latest index. + (see https://github.com/neondatabase/neon/issues/6951) + """ + + # Run with local_fs because we will interfere with mtimes by local filesystem access + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(32) + + # Remember generation 1's index path + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + index_path = env.pageserver_remote_storage.index_path(tenant_id, timeline_id) + + # Increment generation by detaching+attaching, and write+flush some data to get a new remote index + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + workload.churn_rows(32) + + # A new index should have been written + assert env.pageserver_remote_storage.index_path(tenant_id, timeline_id) != index_path + + # Hack the mtime on the generation 1 index + log.info(f"Setting old mtime on {index_path}") + os.utime(index_path, times=(time.time(), time.time() - 30 * 24 * 3600)) + env.pageserver.allowed_errors.extend( + [ + ".*Found a newer index while loading an old one.*", + ".*Index age exceeds threshold and a newer index exists.*", + ] + ) + + # Detach from storage controller + attach in an old generation directly on the pageserver. + workload.stop() + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.reconcile_until_idle() + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"}) + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy") + + # The controller would not do this (attach in an old generation): we are doing it to simulate + # a hypothetical profound bug in the controller. + env.pageserver.http_client().tenant_location_conf( + tenant_id, {"generation": 1, "mode": "AttachedSingle", "tenant_conf": {}} + ) + + # The pageserver should react to this situation by refusing to attach the tenant and putting + # it into Broken state + env.pageserver.allowed_errors.append(".*tenant is broken.*") + with pytest.raises( + PageserverApiException, + match="tenant is broken: Index age exceeds threshold and a newer index exists", + ): + env.pageserver.http_client().timeline_detail(tenant_id, timeline_id) From 42e943ed1bdbb383e3b8097f12eb0b6fe0fec31d Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 13 Nov 2024 13:11:02 +0000 Subject: [PATCH 3/4] avoid spurious cow --- libs/remote_storage/src/error.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs index 1119b72a7489..ec9f86899868 100644 --- a/libs/remote_storage/src/error.rs +++ b/libs/remote_storage/src/error.rs @@ -1,5 +1,3 @@ -use std::borrow::Cow; - /// Reasons for downloads or listings to fail. #[derive(Debug)] pub enum DownloadError { @@ -19,7 +17,7 @@ pub enum DownloadError { Timeout, /// Some integrity/consistency check failed during download. This is used during /// timeline loads to cancel the load of a tenant if some timeline detects fatal corruption. - Fatal(Cow<'static, str>), + Fatal(String), /// The file was found in the remote storage, but the download failed. Other(anyhow::Error), } From e33af1b1faa5d3ca8ae767bfada8d80fa6a418c9 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 13 Nov 2024 15:38:36 +0000 Subject: [PATCH 4/4] fix import for moved python type --- test_runner/regress/test_pageserver_generations.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index bc753e76fc06..4f59efb8b3a9 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -34,12 +34,11 @@ wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.pg_version import run_only_on_default_postgres from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, ) -from fixtures.utils import wait_until +from fixtures.utils import run_only_on_default_postgres, wait_until from fixtures.workload import Workload if TYPE_CHECKING: