Skip to content

Commit

Permalink
Another attempt at making test_vm_bits less flaky
Browse files Browse the repository at this point in the history
- Split the first and second parts of the test to two separate tests

- In the first test, disable the aggressive GC, compaction, and
  autovacuum. They are only needed by the second test. I'd like to get
  the first test to a point that the VM page is never all-zeros.
  Disabling autovacuum in the first test is hopefully enough
  to accomplish that.

- Compare the full page images, don't skip page header. After fixing
  the previous point, there should be no discrepancy. LSN still won't
  match, though, because of commit 387a368.

Fixes issue #7984
  • Loading branch information
hlinnaka committed Jun 6, 2024
1 parent d46d194 commit cb2a7ec
Showing 1 changed file with 63 additions and 26 deletions.
89 changes: 63 additions & 26 deletions test_runner/regress/test_vm_bits.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,67 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
# Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK
# record.
#
def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
# This is a repro for the bug fixed in commit 66fa176cc8.
#
def test_vm_bit_clear_on_heap_lock_whitebox(neon_env_builder: NeonEnvBuilder):
    """
    Lock a row in a fully frozen table, then check that the visibility map
    page held in the compute's shared buffers matches the same page as
    served by get_raw_page_at_lsn().
    """
    env = neon_env_builder.init_start()

    # Autovacuum stays off for the whole test: if auto-analyze runs at the
    # same time as our VACUUM FREEZE, it can hold a snapshot that prevents
    # the tuples from being frozen.
    endpoint_settings = ["autovacuum=off", "log_checkpoints=on"]
    endpoint = env.endpoints.create_start("main", config_lines=endpoint_settings)

    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()

    def fetch_vm_page_hex(query):
        # Run a query that returns a single raw page, and return bytes 8..99
        # of it as a hex string. Starting the slice at offset 8 leaves out
        # the leading bytes of the page, which is where the LSN lives.
        cur.execute(query)
        raw_page = cur.fetchall()[0][0]
        return raw_page[8:100].hex()

    # Extensions that provide the functions this test relies on.
    for extension_ddl in (
        "CREATE EXTENSION neon_test_utils",
        "CREATE EXTENSION pageinspect",
    ):
        cur.execute(extension_ddl)

    # Populate a table and freeze it, so that the all-frozen VM bit gets set
    # on every page. The VERBOSE output is copied to the test log to help
    # with debugging if the freeze did not do what we expected.
    for setup_sql in (
        "CREATE TABLE vmtest_lock (id integer PRIMARY KEY)",
        "INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g",
        "VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true, VERBOSE) vmtest_lock",
    ):
        cur.execute(setup_sql)
    for notice in pg_conn.notices:
        log.info(f"{notice}")

    # Take a row lock. That clears the all-frozen VM bit of the page holding
    # the row.
    for lock_sql in (
        "BEGIN",
        "SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE",
        "COMMIT",
    ):
        cur.execute(lock_sql)

    # The copy of the VM page in the shared buffer cache and the copy
    # reconstructed by the pageserver should now be equal, apart from the
    # LSN: clearing a VM bit doesn't bump the LSN in PostgreSQL, but the
    # pageserver updates the LSN when it replays the VM-bit clearing record
    # (since commit 387a36874c).
    #
    # This comparison is a bit fragile, and the test has been flaky a lot in
    # the past. For example, concurrent autoanalyze kept VACUUM FREEZE from
    # freezing the tuples, so not all the VM bits were set. Or autovacuum
    # kicked in and re-froze the page between the get_raw_page() and
    # get_raw_page_at_lsn() calls. With autovacuum disabled, the outcome
    # should be deterministic.
    vm_page_in_cache = fetch_vm_page_hex("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
    vm_page_at_pageserver = fetch_vm_page_hex(
        "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )"
    )

    assert vm_page_at_pageserver == vm_page_in_cache


#
# The previous test is enough to verify the bug that was fixed in
# commit 66fa176cc8. But for good measure, we also reproduce the
# original problem that the missing VM page update caused.
#
def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
tenant_conf = {
"checkpoint_distance": f"{128 * 1024}",
"compaction_target_size": f"{128 * 1024}",
Expand All @@ -130,9 +190,9 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)

tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock")
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start(
"test_vm_bit_clear_on_heap_lock",
"main",
config_lines=[
"log_autovacuum_min_duration = 0",
# Perform anti-wraparound vacuuming aggressively
Expand All @@ -146,12 +206,10 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):

# Install extension containing function needed for test
cur.execute("CREATE EXTENSION neon_test_utils")
cur.execute("CREATE EXTENSION pageinspect")

# Create a test table and freeze it to set the all-frozen VM bit on all pages.
cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)")
cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g")

cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true) vmtest_lock")

# Lock a row. This clears the all-frozen VM bit for that page.
Expand All @@ -165,27 +223,6 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):

cur.execute("COMMIT")

# The VM page in shared buffer cache, and the same page as reconstructed
# by the pageserver, should be equal.
#
# Ignore page header (24 bytes) of visibility map.
# If the dirty VM page is flushed from the cache for some reason,
# it gets WAL-logged, which changes the LSN on the page.
# Also in neon SMGR we can replace empty heap page with zero (uninitialized) heap page.
cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
vm_page_in_cache = (cur.fetchall()[0][0])[24:100].hex()
cur.execute(
"select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )"
)
vm_page_at_pageserver = (cur.fetchall()[0][0])[24:100].hex()

assert vm_page_at_pageserver == vm_page_in_cache

# The above assert is enough to verify the bug that was fixed in
# commit 66fa176cc8. But for good measure, we also reproduce the
# original problem that the missing VM page update caused. The
# rest of the test does that.

# Kill and restart postgres, to clear the buffer cache.
#
# NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages
Expand Down

0 comments on commit cb2a7ec

Please sign in to comment.