diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b0afb6414be2..b4a0d1ac0235 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -99,8 +99,6 @@ pub mod defaults { pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; - pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async"; - /// /// Default built-in configuration file. /// @@ -146,8 +144,6 @@ pub mod defaults { #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' -#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}' - [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -300,8 +296,6 @@ pub struct PageServerConf { /// /// Setting this to zero disables limits on total ephemeral layer size. pub ephemeral_bytes_per_memory_kb: usize, - - pub walredo_process_kind: crate::walredo::ProcessKind, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -407,8 +401,6 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, - - walredo_process_kind: BuilderValue, } impl PageServerConfigBuilder { @@ -497,8 +489,6 @@ impl PageServerConfigBuilder { )), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), - - walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()), } } } @@ -686,10 +676,6 @@ impl PageServerConfigBuilder { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } - pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) { - self.walredo_process_kind = BuilderValue::Set(value); - } - pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -747,7 +733,6 @@ impl PageServerConfigBuilder { max_vectored_read_bytes, validate_vectored_get, ephemeral_bytes_per_memory_kb, - walredo_process_kind, } CUSTOM LOGIC { @@ -1044,9 +1029,6 @@ impl PageServerConf { "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } - "walredo_process_kind" => { - builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?) - } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1130,7 +1112,6 @@ impl PageServerConf { ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), } } } @@ -1370,7 +1351,6 @@ background_task_maximum_delay = '334 s' ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Correct defaults should be used when no config values are provided" ); @@ -1444,7 +1424,6 @@ background_task_maximum_delay = '334 s' ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index d660b68a3491..d562540bde9b 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -20,7 +20,6 @@ /// Process lifecycle and abstracction for the IPC protocol. mod process; -pub use process::Kind as ProcessKind; /// Code to apply [`NeonWalRecord`]s. pub(crate) mod apply_neon; @@ -54,7 +53,7 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - /// The current [`process::Process`] that is used by new redo requests. + /// The current [`process::WalRedoProcess`] that is used by new redo requests. /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the /// their process object; we use [`Arc::clone`] for that. @@ -66,7 +65,7 @@ pub struct PostgresRedoManager { /// still be using the old redo process. But, those other tasks will most likely /// encounter an error as well, and errors are an unexpected condition anyway. /// So, probably we could get rid of the `Arc` in the future. - redo_process: heavier_once_cell::OnceCell>, + redo_process: heavier_once_cell::OnceCell>, } /// @@ -211,26 +210,31 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = match self.redo_process.get_or_init_detached().await { - Ok(guard) => Arc::clone(&guard), - Err(permit) => { - // don't hold poison_guard, the launch code can bail - let start = Instant::now(); - let proc = Arc::new( - process::Process::launch(self.conf, self.tenant_shard_id, pg_version) + let proc: Arc = + match self.redo_process.get_or_init_detached().await { + Ok(guard) => Arc::clone(&guard), + Err(permit) => { + // don't hold poison_guard, the launch code can bail + let start = Instant::now(); + let proc = Arc::new( + process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process.set(Arc::clone(&proc), permit); - proc - } - }; + ); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process.set(Arc::clone(&proc), permit); + proc + } + }; let started_at = std::time::Instant::now(); diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 02c9c04bf185..5b0af334eee5 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -1,64 +1,184 @@ -/// Layer of indirection previously used to support multiple implementations. -/// Subject to removal: -use std::time::Duration; - -use bytes::Bytes; -use pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use tracing::warn; -use utils::lsn::Lsn; - -use crate::{config::PageServerConf, walrecord::NeonWalRecord}; - mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. mod protocol; -mod process_impl { - pub(super) mod process_async; +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, +}; +use anyhow::Context; +use bytes::Bytes; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + process::{Command, Stdio}, + time::Duration, +}; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, poison::Poison}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: tokio::sync::Mutex>, + stdin: tokio::sync::Mutex>, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, } -#[derive( - Clone, - Copy, - Debug, - PartialEq, - Eq, - strum_macros::EnumString, - strum_macros::Display, - strum_macros::IntoStaticStr, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -#[repr(u8)] -pub enum Kind { - Sync, - Async, +struct ProcessInput { + stdin: tokio::process::ChildStdin, + n_requests: usize, } -pub(crate) struct Process(process_impl::process_async::WalRedoProcess); +struct ProcessOutput { + stdout: tokio::process::ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} -impl Process { - #[inline(always)] - pub fn launch( +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { - if conf.walredo_process_kind != Kind::Async { - warn!( - configured = %conf.walredo_process_kind, - "the walredo_process_kind setting has been turned into a no-op, using async implementation" - ); - } - Ok(Self(process_impl::process_async::WalRedoProcess::launch( + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + let stdin = + tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; + let stdout = tokio::process::ChildStdout::from_std(stdout) + .context("convert to tokio::ChildStdout")?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. + match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { conf, tenant_shard_id, - pg_version, - )?)) + child: Some(child), + stdin: tokio::sync::Mutex::new(Poison::new( + "stdin", + ProcessInput { + stdin, + n_requests: 0, + }, + )), + stdout: tokio::sync::Mutex::new(Poison::new( + "stdout", + ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }, + )), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) } - #[inline(always)] + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + /// Apply given WAL records ('records') over an old page image. Returns + /// new page image. + /// + /// # Cancel-Safety + /// + /// Cancellation safe. + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] pub(crate) async fn apply_wal_records( &self, rel: RelTag, @@ -67,12 +187,191 @@ impl Process { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> anyhow::Result { - self.0 - .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) - .await + let tag = protocol::BufferTag { rel, blknum }; + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. + let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let Ok(res) = + tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await + else { + anyhow::bail!("WAL redo timed out"); + }; + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res } - pub(crate) fn id(&self) -> u32 { - self.0.id() + /// # Cancel-Safety + /// + /// When not polled to completion (e.g. because in `tokio::select!` another + /// branch becomes ready before this future), concurrent and subsequent + /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. + /// Dispose of this process instance and create a new one. + async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { + let request_no = { + let mut lock_guard = self.stdin.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let input = poison_guard.data_mut(); + input + .stdin + .write_all(writebuf) + .await + .context("write to walredo stdin")?; + let request_no = input.n_requests; + input.n_requests += 1; + poison_guard.disarm(); + request_no + }; + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut lock_guard = self.stdout.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let output = poison_guard.data_mut(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + output + .stdout + .read_exact(&mut resultbuf) + .await + .context("read walredo stdout")?; + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + poison_guard.disarm(); + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + use std::io::Write; + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only } } diff --git a/pageserver/src/walredo/process/process_impl/process_async.rs b/pageserver/src/walredo/process/process_impl/process_async.rs deleted file mode 100644 index 262858b033fe..000000000000 --- a/pageserver/src/walredo/process/process_impl/process_async.rs +++ /dev/null @@ -1,374 +0,0 @@ -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - walrecord::NeonWalRecord, - walredo::process::{no_leak_child, protocol}, -}; -use anyhow::Context; -use bytes::Bytes; -use pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use postgres_ffi::BLCKSZ; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - process::{Command, Stdio}, - time::Duration, -}; -use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, poison::Poison}; - -pub struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: tokio::sync::Mutex>, - stdin: tokio::sync::Mutex>, - /// Counter to separate same sized walredo inputs failing at the same millisecond. - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, -} - -struct ProcessInput { - stdin: tokio::process::ChildStdin, - n_requests: usize, -} - -struct ProcessOutput { - stdout: tokio::process::ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, -} - -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(pg_version=pg_version))] - pub(crate) fn launch( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - pg_version: u32, - ) -> anyhow::Result { - crate::span::debug_assert_current_span_has_tenant_id(); - - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - use no_leak_child::NoLeakChildCommandExt; - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - // the first arg must be --wal-redo so the child process enters into walredo mode - .arg("--wal-redo") - // the child doesn't process this arg, but, having it in the argv helps indentify the - // walredo process for a particular tenant when debugging a pagserver - .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - let stdin = - tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; - let stdout = tokio::process::ChildStdout::from_std(stdout) - .context("convert to tokio::ChildStdout")?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! { - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. - match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: tokio::sync::Mutex::new(Poison::new( - "stdin", - ProcessInput { - stdin, - n_requests: 0, - }, - )), - stdout: tokio::sync::Mutex::new(Poison::new( - "stdout", - ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }, - )), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), - }) - } - - pub(crate) fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - /// Apply given WAL records ('records') over an old page image. Returns - /// new page image. - /// - /// # Cancel-Safety - /// - /// Cancellation safe. - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - pub(crate) async fn apply_wal_records( - &self, - rel: RelTag, - blknum: u32, - base_img: &Option, - records: &[(Lsn, NeonWalRecord)], - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let tag = protocol::BufferTag { rel, blknum }; - - // Serialize all the messages to send the WAL redo process first. - // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. - let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - protocol::build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); - } - } - protocol::build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let Ok(res) = - tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await - else { - anyhow::bail!("WAL redo timed out"); - }; - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - /// # Cancel-Safety - /// - /// When not polled to completion (e.g. because in `tokio::select!` another - /// branch becomes ready before this future), concurrent and subsequent - /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. - /// Dispose of this process instance and create a new one. - async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { - let request_no = { - let mut lock_guard = self.stdin.lock().await; - let mut poison_guard = lock_guard.check_and_arm()?; - let input = poison_guard.data_mut(); - input - .stdin - .write_all(writebuf) - .await - .context("write to walredo stdin")?; - let request_no = input.n_requests; - input.n_requests += 1; - poison_guard.disarm(); - request_no - }; - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. - - let mut lock_guard = self.stdout.lock().await; - let mut poison_guard = lock_guard.check_and_arm()?; - let output = poison_guard.data_mut(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - output - .stdout - .read_exact(&mut resultbuf) - .await - .context("read walredo stdout")?; - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. - // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - poison_guard.disarm(); - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - use std::sync::atomic::Ordering; - - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - use std::io::Write; - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); - } - } - - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} - -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only - } -}