Skip to content

Commit

Permalink
test_suite: patch, don't replace, the tenant_config field, where ap…
Browse files Browse the repository at this point in the history
…propriate (#7771)

Before this PR, the changed tests would overwrite the entire
`tenant_config` because `pageserver_config_override` is merged
non-recursively into the `ps_cfg`.

This meant they would override the
`PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM`, impacting our
matrix build for `compaction_algorithm=Tiered|Legacy` in
#7748.

I found the tests fixed in this PR using the
`NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM` env var that
I added in #7748. Therefore, I think this is an exhaustive fix. This is
better than just searching the code base for `tenant_config`, which is
what I had sketched in #7747.

refs #7749
  • Loading branch information
problame authored May 17, 2024
1 parent 4b8809b commit 6d951e6
Show file tree
Hide file tree
Showing 11 changed files with 70 additions and 46 deletions.
14 changes: 9 additions & 5 deletions test_runner/fixtures/neon_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ def __init__(
test_overlay_dir: Optional[Path] = None,
pageserver_remote_storage: Optional[RemoteStorage] = None,
# toml that will be decomposed into `--config-override` flags during `pageserver --init`
pageserver_config_override: Optional[str] = None,
pageserver_config_override: Optional[str | Callable[[Dict[str, Any]], None]] = None,
num_safekeepers: int = 1,
num_pageservers: int = 1,
# Use non-standard SK ids to check for various parsing bugs
Expand Down Expand Up @@ -1127,10 +1127,14 @@ def __init__(self, config: NeonEnvBuilder):
)

if config.pageserver_config_override is not None:
for o in config.pageserver_config_override.split(";"):
override = toml.loads(o)
for key, value in override.items():
ps_cfg[key] = value
if callable(config.pageserver_config_override):
config.pageserver_config_override(ps_cfg)
else:
assert isinstance(config.pageserver_config_override, str)
for o in config.pageserver_config_override.split(";"):
override = toml.loads(o)
for key, value in override.items():
ps_cfg[key] = value

# Create a corresponding NeonPageserver object
self.pageservers.append(
Expand Down
3 changes: 1 addition & 2 deletions test_runner/regress/test_branch_behind.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
#
def test_branch_behind(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})

error_regexes = [
".*invalid branch start lsn.*",
Expand Down
9 changes: 3 additions & 6 deletions test_runner/regress/test_gc_aggressive.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,7 @@ async def update_table(endpoint: Endpoint):
#
def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
timeline = env.neon_cli.create_branch("test_gc_aggressive", "main")
endpoint = env.endpoints.create_start("test_gc_aggressive")

Expand All @@ -94,13 +93,11 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):

#
def test_gc_index_upload(neon_env_builder: NeonEnvBuilder):
# Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
num_index_uploads = 0

neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

env = neon_env_builder.init_start()
# Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main")
endpoint = env.endpoints.create_start("test_gc_index_upload")
Expand Down
3 changes: 1 addition & 2 deletions test_runner/regress/test_old_request_lsn.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@
#
def test_old_request_lsn(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
env.neon_cli.create_branch("test_old_request_lsn", "main")
endpoint = env.endpoints.create_start("test_old_request_lsn")

Expand Down
2 changes: 2 additions & 0 deletions test_runner/regress/test_pageserver_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def run_pageserver(args):
"listen_http_addr",
"pg_auth_type",
"http_auth_type",
# TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748
# "tenant_config",
]
required_config_overrides = [
f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys
Expand Down
6 changes: 2 additions & 4 deletions test_runner/regress/test_pitr_gc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,9 @@
#
def test_pitr_gc(neon_env_builder: NeonEnvBuilder):
# Set pitr interval such that we need to keep the data
neon_env_builder.pageserver_config_override = (
"tenant_config={pitr_interval = '1 day', gc_horizon = 0}"
env = neon_env_builder.init_start(
initial_tenant_conf={"pitr_interval": "1 day", "gc_horizon": "0"}
)

env = neon_env_builder.init_start()
endpoint_main = env.endpoints.create_start("main")

main_pg_conn = endpoint_main.connect()
Expand Down
8 changes: 5 additions & 3 deletions test_runner/regress/test_recovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@
#
def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
# Override default checkpointer settings to run it more often
neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"

env = neon_env_builder.init_start()
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_distance": "1048576",
}
)
env.pageserver.is_testing_enabled_or_skip()

# We expect the pageserver to exit, which will cause storage storage controller
Expand Down
27 changes: 17 additions & 10 deletions test_runner/regress/test_tenant_conf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
from contextlib import closing
from typing import Any, Dict

import psycopg2.extras
from fixtures.common_types import Lsn
Expand All @@ -14,16 +15,22 @@

def test_tenant_config(neon_env_builder: NeonEnvBuilder):
"""Test per tenant configuration"""
# set some non-default global config
neon_env_builder.pageserver_config_override = """
page_cache_size=444;
wait_lsn_timeout='111 s';
[tenant_config]
checkpoint_distance = 10000
compaction_target_size = 1048576
evictions_low_residence_duration_metric_threshold = "2 days"
eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" }
"""

def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]):
ps_cfg["page_cache_size"] = 444
ps_cfg["wait_lsn_timeout"] = "111 s"

tenant_config = ps_cfg.setdefault("tenant_config", {})
tenant_config["checkpoint_distance"] = 10000
tenant_config["compaction_target_size"] = 1048576
tenant_config["evictions_low_residence_duration_metric_threshold"] = "2 days"
tenant_config["eviction_policy"] = {
"kind": "LayerAccessThreshold",
"period": "20s",
"threshold": "23 hours",
}

neon_env_builder.pageserver_config_override = set_some_nondefault_global_config

env = neon_env_builder.init_start()
# we configure eviction but no remote storage, there might be error lines
Expand Down
11 changes: 8 additions & 3 deletions test_runner/regress/test_tenant_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,9 +502,14 @@ def test_get_tenant_size_with_multiple_branches(

gc_horizon = 128 * 1024

neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"

env = neon_env_builder.init_start()
env = neon_env_builder.init_start(
initial_tenant_conf={
"compaction_period": "0s",
"gc_period": "0s",
"pitr_interval": "0sec",
"gc_horizon": gc_horizon,
}
)

# FIXME: we have a race condition between GC and delete timeline. GC might fail with this
# error. Similar to https://github.com/neondatabase/neon/issues/2671
Expand Down
20 changes: 13 additions & 7 deletions test_runner/regress/test_timeline_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,11 +415,12 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder

# Disable background compaction as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
neon_env_builder.pageserver_config_override = (
"tenant_config={checkpoint_distance=100000, compaction_period='10m'}"
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_distance": "100000",
"compaction_period": "10m",
}
)

env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()

new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction")
Expand Down Expand Up @@ -462,9 +463,14 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):

# Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}"

env = neon_env_builder.init_start()
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_distance": "100000",
"compaction_period": "0s",
"gc_period": "0s",
"pitr_interval": "1s",
}
)
pageserver_http = env.pageserver.http_client()

new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc")
Expand Down
13 changes: 9 additions & 4 deletions test_runner/regress/test_wal_receiver.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import time
from typing import Any, Dict

from fixtures.common_types import Lsn, TenantId
from fixtures.log_helper import log
Expand Down Expand Up @@ -42,10 +43,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
# Kills one of the safekeepers and ensures that only the active ones are printed in the state.
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
# Trigger WAL wait timeout faster
neon_env_builder.pageserver_config_override = """
wait_lsn_timeout = "1s"
tenant_config={walreceiver_connect_timeout = "2s", lagging_wal_timeout = "2s"}
"""
def customize_pageserver_toml(ps_cfg: Dict[str, Any]):
ps_cfg["wait_lsn_timeout"] = "1s"
tenant_config = ps_cfg.setdefault("tenant_config", {})
tenant_config["walreceiver_connect_timeout"] = "2s"
tenant_config["lagging_wal_timeout"] = "2s"

neon_env_builder.pageserver_config_override = customize_pageserver_toml

# Have notable SK ids to ensure we check logs for their presence, not some other random numbers
neon_env_builder.safekeepers_id_start = 12345
neon_env_builder.num_safekeepers = 3
Expand Down

1 comment on commit 6d951e6

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

3153 tests run: 3013 passed, 0 failed, 140 skipped (full report)


Flaky tests (2)

Postgres 16

  • test_hot_standby_gc: debug

Postgres 14

  • test_empty_branch_remote_storage_upload: debug

Code coverage* (full report)

  • functions: 31.4% (6349 of 20190 functions)
  • lines: 47.4% (47938 of 101054 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
6d951e6 at 2024-05-17T11:45:08.329Z :recycle:

Please sign in to comment.