From d180abc6c32cb44a6c91fb66d5a5d12b88d823d3 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 10 Oct 2024 09:03:35 +0300 Subject: [PATCH 01/11] Correctly truncate VM --- libs/postgres_ffi/src/pg_constants.rs | 7 ++++-- pageserver/src/walingest.rs | 31 ++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 497d011d7a20..e343473d7732 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -243,8 +243,11 @@ const FSM_LEAF_NODES_PER_PAGE: usize = FSM_NODES_PER_PAGE - FSM_NON_LEAF_NODES_P pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32; /* From visibilitymap.c */ -pub const VM_HEAPBLOCKS_PER_PAGE: u32 = - (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) + +pub const VM_MAPSIZE: usize = BLCKSZ as usize - MAXALIGN_SIZE_OF_PAGE_HEADER_DATA; +pub const VM_BITS_PER_HEAPBLOCK: usize = 2; +pub const VM_HEAPBLOCKS_PER_BYTE: usize = 8 / VM_BITS_PER_HEAPBLOCK; +pub const VM_HEAPBLOCKS_PER_PAGE: usize = VM_MAPSIZE * VM_HEAPBLOCKS_PER_BYTE; /* From origin.c */ pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 84353970b78b..0720bdede24b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -621,11 +621,32 @@ impl WalIngest { forknum: VISIBILITYMAP_FORKNUM, }; - let mut vm_page_no = blkno / pg_constants::VM_HEAPBLOCKS_PER_PAGE; - if blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { - // Tail of last remaining vm page has to be zeroed. - // We are not precise here and instead of digging in VM bitmap format just clear the whole page. - modification.put_rel_page_image_zero(rel, vm_page_no)?; + let mut vm_page_no = rec.blkno / (pg_constants::VM_HEAPBLOCKS_PER_PAGE as u32); + let trunc_byte = rec.blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE + / pg_constants::VM_HEAPBLOCKS_PER_BYTE; + let trunc_offset = rec.blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE + * pg_constants::VM_BITS_PER_HEAPBLOCK; + if trunc_byte != 0 || trunc_offset != 0 { + let old_content = modification + .tline + .get_rel_page_at_lsn(rel, vm_page_no, Version::Modified(modification), ctx) + .await?; + let mut new_content = BytesMut::new(); + new_content.extend_from_slice(&old_content); + let map = &mut new_content[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + map[trunc_byte + 1..].fill(0u8); + /*---- + * Mask out the unwanted bits of the last remaining byte. + * + * ((1 << 0) - 1) = 00000000 + * ((1 << 1) - 1) = 00000001 + * ... + * ((1 << 6) - 1) = 00111111 + * ((1 << 7) - 1) = 01111111 + *---- + */ + map[trunc_byte] &= (1 << trunc_offset) - 1; + modification.put_rel_page_image(rel, vm_page_no, new_content.freeze())?; vm_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; From 858e320b54b3e8907d838d16dbe1c861028af584 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Oct 2024 13:49:45 +0300 Subject: [PATCH 02/11] Copy some comments from visibilitymap.c --- pageserver/src/walingest.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 0720bdede24b..74f0d9eaf0f9 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -621,11 +621,22 @@ impl WalIngest { forknum: VISIBILITYMAP_FORKNUM, }; + // + // Code copied from PostgreSQL `visibilitymap_prepare_truncate` function in `visibilitymap.c` + // + + // last remaining block, byte, and bit let mut vm_page_no = rec.blkno / (pg_constants::VM_HEAPBLOCKS_PER_PAGE as u32); let trunc_byte = rec.blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE / pg_constants::VM_HEAPBLOCKS_PER_BYTE; let trunc_offset = rec.blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE * pg_constants::VM_BITS_PER_HEAPBLOCK; + + // Unless the new size is exactly at a visibility map page boundary, the + // tail bits in the last remaining map page, representing truncated heap + // blocks, need to be cleared. This is not only tidy, but also necessary + // because we don't get a chance to clear the bits if the heap is extended + // again. if trunc_byte != 0 || trunc_offset != 0 { let old_content = modification .tline @@ -634,7 +645,10 @@ impl WalIngest { let mut new_content = BytesMut::new(); new_content.extend_from_slice(&old_content); let map = &mut new_content[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + + // Clear out the unwanted bytes. map[trunc_byte + 1..].fill(0u8); + /*---- * Mask out the unwanted bits of the last remaining byte. * @@ -646,9 +660,11 @@ impl WalIngest { *---- */ map[trunc_byte] &= (1 << trunc_offset) - 1; + modification.put_rel_page_image(rel, vm_page_no, new_content.freeze())?; vm_page_no += 1; } + let nblocks = get_relsize(modification, rel, ctx).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position From 3b11b8cee4289b067be42a477bed88974dfd66a7 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 10 Oct 2024 16:30:15 +0300 Subject: [PATCH 03/11] Handle absence of truncated VM page --- pageserver/src/walingest.rs | 44 ++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 74f0d9eaf0f9..2970a8767da8 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -638,30 +638,30 @@ impl WalIngest { // because we don't get a chance to clear the bits if the heap is extended // again. if trunc_byte != 0 || trunc_offset != 0 { - let old_content = modification + if let Ok(old_content) = modification .tline .get_rel_page_at_lsn(rel, vm_page_no, Version::Modified(modification), ctx) - .await?; - let mut new_content = BytesMut::new(); - new_content.extend_from_slice(&old_content); - let map = &mut new_content[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; - - // Clear out the unwanted bytes. - map[trunc_byte + 1..].fill(0u8); - - /*---- - * Mask out the unwanted bits of the last remaining byte. - * - * ((1 << 0) - 1) = 00000000 - * ((1 << 1) - 1) = 00000001 - * ... - * ((1 << 6) - 1) = 00111111 - * ((1 << 7) - 1) = 01111111 - *---- - */ - map[trunc_byte] &= (1 << trunc_offset) - 1; - - modification.put_rel_page_image(rel, vm_page_no, new_content.freeze())?; + .await + { + let mut new_content = BytesMut::new(); + new_content.extend_from_slice(&old_content); + let map = &mut new_content[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + map[trunc_byte + 1..].fill(0u8); + /*---- + * Mask out the unwanted bits of the last remaining byte. + * + * ((1 << 0) - 1) = 00000000 + * ((1 << 1) - 1) = 00000001 + * ... + * ((1 << 6) - 1) = 00111111 + * ((1 << 7) - 1) = 01111111 + *---- + */ + map[trunc_byte] &= (1 << trunc_offset) - 1; + modification.put_rel_page_image(rel, vm_page_no, new_content.freeze())?; + } else { + modification.put_rel_page_image_zero(rel, vm_page_no)?; + } vm_page_no += 1; } From 9a4e9116ea219d9e5057218e39393aeb60cc4fe2 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 14 Oct 2024 09:38:19 +0300 Subject: [PATCH 04/11] Add more logging to understand CI errors --- libs/pageserver_api/src/shard.rs | 5 ++++- pageserver/src/pgdatadir_mapping.rs | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index e83cf4c855a1..245377a0be68 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -163,7 +163,10 @@ impl ShardIdentity { /// Shards must ingest _at least_ keys which return true from this check. pub fn is_key_local(&self, key: &Key) -> bool { assert!(!self.is_broken()); - if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { + if self.count < ShardCount(2) + || (key_is_shard0(key) && self.number == ShardNumber(0)) + || key.is_rel_vm_block_key() + { true } else { key_to_shard_number(self.count, self.stripe_size, key) == self.number diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index dc2dc08b534a..6bc9393dd6e1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1905,6 +1905,13 @@ impl<'a> DatadirModification<'a> { } self.pending_bytes += val_serialized_size; + if Key::from_compact(key).is_rel_vm_block_key() { + let backtrace = std::backtrace::Backtrace::capture(); + + if backtrace.status() == std::backtrace::BacktraceStatus::Captured { + info!("Update VM page {key}\n{backtrace}"); + } + } self.pending_data_pages .push((key, self.lsn, val_serialized_size, val)) } From fce10bc5d7fe1cb9222d70e816e1d997411e8d05 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 14 Oct 2024 15:06:57 +0300 Subject: [PATCH 05/11] Check that VM page belongsto shard in truncate --- libs/pageserver_api/src/shard.rs | 5 +---- pageserver/src/walingest.rs | 4 +++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 245377a0be68..e83cf4c855a1 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -163,10 +163,7 @@ impl ShardIdentity { /// Shards must ingest _at least_ keys which return true from this check. pub fn is_key_local(&self, key: &Key) -> bool { assert!(!self.is_broken()); - if self.count < ShardCount(2) - || (key_is_shard0(key) && self.number == ShardNumber(0)) - || key.is_rel_vm_block_key() - { + if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { true } else { key_to_shard_number(self.count, self.stripe_size, key) == self.number diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 2970a8767da8..4dcaef50d911 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -637,7 +637,9 @@ impl WalIngest { // blocks, need to be cleared. This is not only tidy, but also necessary // because we don't get a chance to clear the bits if the heap is extended // again. - if trunc_byte != 0 || trunc_offset != 0 { + if (trunc_byte != 0 || trunc_offset != 0) + && self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no)) + { if let Ok(old_content) = modification .tline .get_rel_page_at_lsn(rel, vm_page_no, Version::Modified(modification), ctx) From 182139a456ec430444aa15bbe6747b5e2a1e3d2c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 14 Oct 2024 17:36:20 +0300 Subject: [PATCH 06/11] Donot print stack trace --- pageserver/src/pgdatadir_mapping.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6bc9393dd6e1..ffe44e5b4925 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1874,6 +1874,7 @@ impl<'a> DatadirModification<'a> { // data pages during ingest. if cfg!(debug_assertions) { for (dirty_key, _, _, _) in &self.pending_data_pages { + tracing::error!("dirty_key={dirty_key}"); debug_assert!(&key.to_compact() != dirty_key); } @@ -1905,13 +1906,6 @@ impl<'a> DatadirModification<'a> { } self.pending_bytes += val_serialized_size; - if Key::from_compact(key).is_rel_vm_block_key() { - let backtrace = std::backtrace::Backtrace::capture(); - - if backtrace.status() == std::backtrace::BacktraceStatus::Captured { - info!("Update VM page {key}\n{backtrace}"); - } - } self.pending_data_pages .push((key, self.lsn, val_serialized_size, val)) } From f5da1896b6e51f866d35789a2f6f8b8daea5e12b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 15 Oct 2024 12:16:01 +0300 Subject: [PATCH 07/11] Move TruncateVisibilityMap to the end of NeonWalRecord enum to preserve serialed representation of other members --- libs/postgres_ffi/src/walrecord.rs | 56 ++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs index dedbaef64d37..51d375e8346f 100644 --- a/libs/postgres_ffi/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -15,6 +15,7 @@ use serde::{Deserialize, Serialize}; use utils::bin_ser::DeserializeError; use utils::lsn::Lsn; +<<<<<<< HEAD:libs/postgres_ffi/src/walrecord.rs #[repr(C)] #[derive(Debug)] pub struct XlMultiXactCreate { @@ -25,6 +26,61 @@ pub struct XlMultiXactCreate { pub nmembers: u32, /* number of member XIDs */ pub members: Vec, +======= +/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper +/// around a PostgreSQL WAL record, or a custom neon-specific "record". +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub enum NeonWalRecord { + /// Native PostgreSQL WAL record + Postgres { will_init: bool, rec: Bytes }, + + /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear) + ClearVisibilityMapFlags { + new_heap_blkno: Option, + old_heap_blkno: Option, + flags: u8, + }, + /// Mark transaction IDs as committed on a CLOG page + ClogSetCommitted { + xids: Vec, + timestamp: TimestampTz, + }, + /// Mark transaction IDs as aborted on a CLOG page + ClogSetAborted { xids: Vec }, + /// Extend multixact offsets SLRU + MultixactOffsetCreate { + mid: MultiXactId, + moff: MultiXactOffset, + }, + /// Extend multixact members SLRU. + MultixactMembersCreate { + moff: MultiXactOffset, + members: Vec, + }, + /// Update the map of AUX files, either writing or dropping an entry + AuxFile { + file_path: String, + content: Option, + }, + // Truncate visibility map page + TruncateVisibilityMap { + trunc_byte: usize, + trunc_offs: usize, + }, + + /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it. + #[cfg(test)] + Test { + /// Append a string to the image. + append: String, + /// Clear the image before appending. + clear: bool, + /// Treat this record as an init record. `clear` should be set to true if this field is set + /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and + /// its references in `timeline.rs`. + will_init: bool, + }, +>>>>>>> d09c9fca1 (Move TruncateVisibilityMap to the end of NeonWalRecord enum to preserve serialed representation of other members):pageserver/src/walrecord.rs } impl XlMultiXactCreate { From c5c74cad813e9cea4fb85f31db580d7ca5b9e425 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 25 Oct 2024 09:10:04 +0300 Subject: [PATCH 08/11] Fix merge problems --- pageserver/src/walingest.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 4dcaef50d911..5394926821a9 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -626,10 +626,10 @@ impl WalIngest { // // last remaining block, byte, and bit - let mut vm_page_no = rec.blkno / (pg_constants::VM_HEAPBLOCKS_PER_PAGE as u32); - let trunc_byte = rec.blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE + let mut vm_page_no = blkno / (pg_constants::VM_HEAPBLOCKS_PER_PAGE as u32); + let trunc_byte = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE / pg_constants::VM_HEAPBLOCKS_PER_BYTE; - let trunc_offset = rec.blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE + let trunc_offset = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_BYTE * pg_constants::VM_BITS_PER_HEAPBLOCK; // Unless the new size is exactly at a visibility map page boundary, the From 69e0aaf6ef75a763f6db8f14d3999b9509cf9887 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 1 Nov 2024 10:27:08 +0200 Subject: [PATCH 09/11] Add test_vm_truncate test --- libs/pageserver_api/src/record.rs | 5 +++ libs/postgres_ffi/src/walrecord.rs | 56 ------------------------- pageserver/src/walingest.rs | 41 +++++------------- pageserver/src/walredo/apply_neon.rs | 28 +++++++++++++ test_runner/regress/test_vm_truncate.py | 27 ++++++++++++ 5 files changed, 70 insertions(+), 87 deletions(-) create mode 100644 test_runner/regress/test_vm_truncate.py diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs index b80ed2f20353..f5cb6213075f 100644 --- a/libs/pageserver_api/src/record.rs +++ b/libs/pageserver_api/src/record.rs @@ -41,6 +41,11 @@ pub enum NeonWalRecord { file_path: String, content: Option, }, + // Truncate visibility map page + TruncateVisibilityMap { + trunc_byte: usize, + trunc_offs: usize, + }, /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it. #[cfg(feature = "testing")] diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs index 51d375e8346f..dedbaef64d37 100644 --- a/libs/postgres_ffi/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -15,7 +15,6 @@ use serde::{Deserialize, Serialize}; use utils::bin_ser::DeserializeError; use utils::lsn::Lsn; -<<<<<<< HEAD:libs/postgres_ffi/src/walrecord.rs #[repr(C)] #[derive(Debug)] pub struct XlMultiXactCreate { @@ -26,61 +25,6 @@ pub struct XlMultiXactCreate { pub nmembers: u32, /* number of member XIDs */ pub members: Vec, -======= -/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper -/// around a PostgreSQL WAL record, or a custom neon-specific "record". -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub enum NeonWalRecord { - /// Native PostgreSQL WAL record - Postgres { will_init: bool, rec: Bytes }, - - /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear) - ClearVisibilityMapFlags { - new_heap_blkno: Option, - old_heap_blkno: Option, - flags: u8, - }, - /// Mark transaction IDs as committed on a CLOG page - ClogSetCommitted { - xids: Vec, - timestamp: TimestampTz, - }, - /// Mark transaction IDs as aborted on a CLOG page - ClogSetAborted { xids: Vec }, - /// Extend multixact offsets SLRU - MultixactOffsetCreate { - mid: MultiXactId, - moff: MultiXactOffset, - }, - /// Extend multixact members SLRU. - MultixactMembersCreate { - moff: MultiXactOffset, - members: Vec, - }, - /// Update the map of AUX files, either writing or dropping an entry - AuxFile { - file_path: String, - content: Option, - }, - // Truncate visibility map page - TruncateVisibilityMap { - trunc_byte: usize, - trunc_offs: usize, - }, - - /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it. - #[cfg(test)] - Test { - /// Append a string to the image. - append: String, - /// Clear the image before appending. - clear: bool, - /// Treat this record as an init record. `clear` should be set to true if this field is set - /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and - /// its references in `timeline.rs`. - will_init: bool, - }, ->>>>>>> d09c9fca1 (Move TruncateVisibilityMap to the end of NeonWalRecord enum to preserve serialed representation of other members):pageserver/src/walrecord.rs } impl XlMultiXactCreate { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 5394926821a9..366e47ef2a63 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -621,15 +621,11 @@ impl WalIngest { forknum: VISIBILITYMAP_FORKNUM, }; - // - // Code copied from PostgreSQL `visibilitymap_prepare_truncate` function in `visibilitymap.c` - // - // last remaining block, byte, and bit let mut vm_page_no = blkno / (pg_constants::VM_HEAPBLOCKS_PER_PAGE as u32); let trunc_byte = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE / pg_constants::VM_HEAPBLOCKS_PER_BYTE; - let trunc_offset = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_BYTE + let trunc_offs = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_BYTE * pg_constants::VM_BITS_PER_HEAPBLOCK; // Unless the new size is exactly at a visibility map page boundary, the @@ -637,36 +633,19 @@ impl WalIngest { // blocks, need to be cleared. This is not only tidy, but also necessary // because we don't get a chance to clear the bits if the heap is extended // again. - if (trunc_byte != 0 || trunc_offset != 0) + if (trunc_byte != 0 || trunc_offs != 0) && self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no)) { - if let Ok(old_content) = modification - .tline - .get_rel_page_at_lsn(rel, vm_page_no, Version::Modified(modification), ctx) - .await - { - let mut new_content = BytesMut::new(); - new_content.extend_from_slice(&old_content); - let map = &mut new_content[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; - map[trunc_byte + 1..].fill(0u8); - /*---- - * Mask out the unwanted bits of the last remaining byte. - * - * ((1 << 0) - 1) = 00000000 - * ((1 << 1) - 1) = 00000001 - * ... - * ((1 << 6) - 1) = 00111111 - * ((1 << 7) - 1) = 01111111 - *---- - */ - map[trunc_byte] &= (1 << trunc_offset) - 1; - modification.put_rel_page_image(rel, vm_page_no, new_content.freeze())?; - } else { - modification.put_rel_page_image_zero(rel, vm_page_no)?; - } + modification.put_rel_wal_record( + rel, + vm_page_no, + NeonWalRecord::TruncateVisibilityMap { + trunc_byte, + trunc_offs, + }, + )?; vm_page_no += 1; } - let nblocks = get_relsize(modification, rel, ctx).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 7aaa357318bd..9cfb89c4c6db 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -42,6 +42,34 @@ pub(crate) fn apply_in_neon( } => { anyhow::bail!("tried to pass postgres wal record to neon WAL redo"); } + // + // Code copied from PostgreSQL `visibilitymap_prepare_truncate` function in `visibilitymap.c` + // + NeonWalRecord::TruncateVisibilityMap { + trunc_byte, + trunc_offs, + } => { + // sanity check that this is modifying the correct relation + let (rel, _) = key.to_rel_block().context("invalid record")?; + assert!( + rel.forknum == VISIBILITYMAP_FORKNUM, + "TruncateVisibilityMap record on unexpected rel {}", + rel + ); + let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + map[*trunc_byte + 1..].fill(0u8); + /*---- + * Mask out the unwanted bits of the last remaining byte. + * + * ((1 << 0) - 1) = 00000000 + * ((1 << 1) - 1) = 00000001 + * ... + * ((1 << 6) - 1) = 00111111 + * ((1 << 7) - 1) = 01111111 + *---- + */ + map[*trunc_byte] &= (1 << *trunc_offs) - 1; + } NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno, diff --git a/test_runner/regress/test_vm_truncate.py b/test_runner/regress/test_vm_truncate.py new file mode 100644 index 000000000000..bca482ef65ad --- /dev/null +++ b/test_runner/regress/test_vm_truncate.py @@ -0,0 +1,27 @@ +from fixtures.neon_fixtures import NeonEnv + + +# +# Test that VM is properly truncated +# +def test_vm_truncate(neon_simple_env: NeonEnv): + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + con = endpoint.connect() + cur = con.cursor() + cur.execute("CREATE EXTENSION neon_test_utils") + cur.execute("CREATE EXTENSION pageinspect") + + cur.execute( + "create table t(pk integer primary key, counter integer default 0, filler text default repeat('?', 200))" + ) + cur.execute("insert into t (pk) values (generate_series(1,1000))") + cur.execute("delete from t where pk>10") + cur.execute("vacuum t") + cur.execute("select substr(encode(get_raw_page('t', 'vm', 0), 'hex'), 48)") + pg_bitmap = cur.fetchall()[0][0] + cur.execute("SELECT clear_buffer_cache()") + cur.execute("select substr(encode(get_raw_page('t', 'vm', 0), 'hex'), 48)") + ps_bitmap = cur.fetchall()[0][0] + assert pg_bitmap == ps_bitmap From 5ed26b10125d3abaa9a03f8e823c1836ebb905e4 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 1 Nov 2024 14:50:48 +0200 Subject: [PATCH 10/11] Add comments to the test --- pageserver/src/pgdatadir_mapping.rs | 1 - test_runner/regress/test_vm_truncate.py | 6 +++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ffe44e5b4925..dc2dc08b534a 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1874,7 +1874,6 @@ impl<'a> DatadirModification<'a> { // data pages during ingest. if cfg!(debug_assertions) { for (dirty_key, _, _, _) in &self.pending_data_pages { - tracing::error!("dirty_key={dirty_key}"); debug_assert!(&key.to_compact() != dirty_key); } diff --git a/test_runner/regress/test_vm_truncate.py b/test_runner/regress/test_vm_truncate.py index bca482ef65ad..c0accdcd7a4e 100644 --- a/test_runner/regress/test_vm_truncate.py +++ b/test_runner/regress/test_vm_truncate.py @@ -18,10 +18,14 @@ def test_vm_truncate(neon_simple_env: NeonEnv): ) cur.execute("insert into t (pk) values (generate_series(1,1000))") cur.execute("delete from t where pk>10") - cur.execute("vacuum t") + cur.execute("vacuum t") # rtruncate all forks pf reation + # get image of first block of VM updted by Postgres excluding page header cur.execute("select substr(encode(get_raw_page('t', 'vm', 0), 'hex'), 48)") pg_bitmap = cur.fetchall()[0][0] + # flush shared buffers cur.execute("SELECT clear_buffer_cache()") + # now downoad firstblock of VM from page server ... cur.execute("select substr(encode(get_raw_page('t', 'vm', 0), 'hex'), 48)") ps_bitmap = cur.fetchall()[0][0] + # and check that they are equal, i.e. PS is producing the same VM page as Postgres assert pg_bitmap == ps_bitmap From 7303d05fd55b42ae9082a282ebc611e27e06852b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 13 Nov 2024 08:46:44 +0200 Subject: [PATCH 11/11] Fix mistypings in comments --- test_runner/regress/test_vm_truncate.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_vm_truncate.py b/test_runner/regress/test_vm_truncate.py index c0accdcd7a4e..43b4f2d8b10b 100644 --- a/test_runner/regress/test_vm_truncate.py +++ b/test_runner/regress/test_vm_truncate.py @@ -18,14 +18,16 @@ def test_vm_truncate(neon_simple_env: NeonEnv): ) cur.execute("insert into t (pk) values (generate_series(1,1000))") cur.execute("delete from t where pk>10") - cur.execute("vacuum t") # rtruncate all forks pf reation - # get image of first block of VM updted by Postgres excluding page header + cur.execute("vacuum t") # truncates the relation, including its VM and FSM + # get image of the first block of the VM excluding the page header. It's expected + # to still be in the buffer cache. + # ignore page header (24 bytes, 48 - it's hex representation) cur.execute("select substr(encode(get_raw_page('t', 'vm', 0), 'hex'), 48)") pg_bitmap = cur.fetchall()[0][0] # flush shared buffers cur.execute("SELECT clear_buffer_cache()") - # now downoad firstblock of VM from page server ... + # now download the first block of the VM from the pageserver ... cur.execute("select substr(encode(get_raw_page('t', 'vm', 0), 'hex'), 48)") ps_bitmap = cur.fetchall()[0][0] - # and check that they are equal, i.e. PS is producing the same VM page as Postgres + # and check that content of bitmaps are equal, i.e. PS is producing the same VM page as Postgres assert pg_bitmap == ps_bitmap