From 67e091c906a2863e32a8599d4f0aca90fc756b74 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 17 Aug 2022 23:24:47 +0300 Subject: [PATCH] Rework `init` in pageserver CLI (#2272) * Do not create initial tenant and timeline (adjust Python tests for that) * Rework config handling during init, add --update-config to manage local config updates --- Dockerfile | 3 - control_plane/src/local_env.rs | 10 +- control_plane/src/safekeeper.rs | 2 +- control_plane/src/storage.rs | 206 +++++++++--------- docker-entrypoint.sh | 24 -- neon_local/src/main.rs | 62 +++--- pageserver/src/bin/pageserver.rs | 176 ++++++++------- pageserver/src/tenant_mgr.rs | 7 +- pageserver/src/timelines.rs | 64 +----- pageserver/src/walredo.rs | 18 -- .../batch_others/test_pageserver_api.py | 45 +++- .../batch_others/test_tenant_relocation.py | 14 +- 12 files changed, 278 insertions(+), 353 deletions(-) delete mode 100755 docker-entrypoint.sh diff --git a/Dockerfile b/Dockerfile index 1afaa41fb4c9..17aa0025e8c3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -58,10 +58,7 @@ COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy COPY --from=pg-build /home/nonroot/tmp_install/ /usr/local/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ -COPY docker-entrypoint.sh /docker-entrypoint.sh - VOLUME ["/data"] USER zenith EXPOSE 6400 -ENTRYPOINT ["/docker-entrypoint.sh"] CMD ["pageserver"] diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index e0b409f32dd2..75e552f6ccab 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -24,7 +24,7 @@ use crate::safekeeper::SafekeeperNode; // This data structures represents neon_local CLI config // // It is deserialized from the .neon/config file, or the config file passed -// to 'zenith init --config=' option. See control_plane/simple.conf for +// to 'neon_local init --config=' option. See control_plane/simple.conf for // an example. // #[serde_as] @@ -320,7 +320,7 @@ impl LocalEnv { if !repopath.exists() { bail!( - "Zenith config is not found in {}. You need to run 'zenith init' first", + "Zenith config is not found in {}. You need to run 'neon_local init' first", repopath.to_str().unwrap() ); } @@ -337,12 +337,12 @@ impl LocalEnv { } pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { - // Currently, the user first passes a config file with 'zenith init --config=' + // Currently, the user first passes a config file with 'neon_local init --config=' // We read that in, in `create_config`, and fill any missing defaults. Then it's saved // to .neon/config. TODO: We lose any formatting and comments along the way, which is // a bit sad. let mut conf_content = r#"# This file describes a locale deployment of the page server -# and safekeeeper node. It is read by the 'zenith' command-line +# and safekeeeper node. It is read by the 'neon_local' command-line # utility. 
"# .to_string(); @@ -382,7 +382,7 @@ impl LocalEnv { } // - // Initialize a new Zenith repository + // Initialize a new Neon repository // pub fn init(&mut self) -> anyhow::Result<()> { // check if config already exists diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 3fda856e135b..652736058a39 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -51,7 +51,7 @@ impl ResponseErrorMessageExt for Response { Err(SafekeeperHttpError::Response( match self.json::() { Ok(err_body) => format!("Error: {}", err_body.msg), - Err(_) => format!("Http error ({}) at {url}.", status.as_u16()), + Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }, )) } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 31858278d3f3..aab29628e3b9 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, Write}; use std::num::NonZeroU64; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::process::Command; use std::time::Duration; use std::{io, result, thread}; @@ -102,23 +102,19 @@ impl PageServerNode { /// Construct libpq connection string for connecting to the pageserver. fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config { - format!("postgresql://no_user:{}@{}/no_db", password, listen_addr) + format!("postgresql://no_user:{password}@{listen_addr}/no_db") .parse() .unwrap() } - pub fn init( + pub fn initialize( &self, create_tenant: Option, initial_timeline_id: Option, config_overrides: &[&str], ) -> anyhow::Result { - let mut cmd = Command::new(self.env.pageserver_bin()?); - let id = format!("id={}", self.env.pageserver.id); - // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. 
- let base_data_dir_param = self.env.base_data_dir.display().to_string(); let pg_distrib_dir_param = format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display()); let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); @@ -138,67 +134,52 @@ impl PageServerNode { .collect::>() .join(",") ); - let mut args = Vec::with_capacity(20); - - args.push("--init"); - args.extend(["-D", &base_data_dir_param]); - args.extend(["-c", &pg_distrib_dir_param]); - args.extend(["-c", &authg_type_param]); - args.extend(["-c", &listen_http_addr_param]); - args.extend(["-c", &listen_pg_addr_param]); - args.extend(["-c", &broker_endpoints_param]); - args.extend(["-c", &id]); - let broker_etcd_prefix_param = self .env .etcd_broker .broker_etcd_prefix .as_ref() .map(|prefix| format!("broker_etcd_prefix='{prefix}'")); - if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() { - args.extend(["-c", broker_etcd_prefix_param]); - } - for config_override in config_overrides { - args.extend(["-c", config_override]); - } + let mut init_config_overrides = config_overrides.to_vec(); + init_config_overrides.push(&id); + init_config_overrides.push(&pg_distrib_dir_param); + init_config_overrides.push(&authg_type_param); + init_config_overrides.push(&listen_http_addr_param); + init_config_overrides.push(&listen_pg_addr_param); + init_config_overrides.push(&broker_endpoints_param); - if self.env.pageserver.auth_type != AuthType::Trust { - args.extend([ - "-c", - "auth_validation_public_key_path='auth_public_key.pem'", - ]); + if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() { + init_config_overrides.push(broker_etcd_prefix_param); } - let create_tenant = create_tenant.map(|id| id.to_string()); - if let Some(tenant_id) = create_tenant.as_deref() { - args.extend(["--create-tenant", tenant_id]) + if self.env.pageserver.auth_type != AuthType::Trust { + init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'"); } - let initial_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); - let initial_timeline_id_string = initial_timeline_id.to_string(); - args.extend(["--initial-timeline-id", &initial_timeline_id_string]); - - let cmd_with_args = cmd.args(args); - let init_output = fill_rust_env_vars(cmd_with_args) - .output() - .with_context(|| { - format!("failed to init pageserver with command {:?}", cmd_with_args) - })?; - - if !init_output.status.success() { - bail!( - "init invocation failed, {}\nStdout: {}\nStderr: {}", - init_output.status, - String::from_utf8_lossy(&init_output.stdout), - String::from_utf8_lossy(&init_output.stderr) - ); + self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?; + let init_result = self + .try_init_timeline(create_tenant, initial_timeline_id) + .context("Failed to create initial tenant and timeline for pageserver"); + match &init_result { + Ok(initial_timeline_id) => { + println!("Successfully initialized timeline {initial_timeline_id}") + } + Err(e) => eprintln!("{e:#}"), } + self.stop(false)?; + init_result + } - // echo the captured output of the init command - println!("{}", String::from_utf8_lossy(&init_output.stdout)); - - Ok(initial_timeline_id) + fn try_init_timeline( + &self, + new_tenant_id: Option, + new_timeline_id: Option, + ) -> anyhow::Result { + let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; + let initial_timeline_info = + self.timeline_create(initial_tenant_id, new_timeline_id, None, None)?; + 
Ok(initial_timeline_info.timeline_id) } pub fn repo_path(&self) -> PathBuf { @@ -210,15 +191,35 @@ impl PageServerNode { } pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> { - print!( + self.start_node(config_overrides, &self.repo_path(), false) + } + + fn start_node( + &self, + config_overrides: &[&str], + datadir: &Path, + update_config: bool, + ) -> anyhow::Result<()> { + println!( "Starting pageserver at '{}' in '{}'", connection_address(&self.pg_connection_config), - self.repo_path().display() + datadir.display() ); - io::stdout().flush().unwrap(); - - let repo_path = self.repo_path(); - let mut args = vec!["-D", repo_path.to_str().unwrap()]; + io::stdout().flush()?; + + let mut args = vec![ + "-D", + datadir.to_str().with_context(|| { + format!( + "Datadir path '{}' cannot be represented as a unicode string", + datadir.display() + ) + })?, + ]; + + if update_config { + args.push("--update-config"); + } for config_override in config_overrides { args.extend(["-c", config_override]); @@ -230,8 +231,8 @@ impl PageServerNode { if !filled_cmd.status()?.success() { bail!( - "Pageserver failed to start. See '{}' for details.", - self.repo_path().join("pageserver.log").display() + "Pageserver failed to start. See console output and '{}' for details.", + datadir.join("pageserver.log").display() ); } @@ -240,7 +241,7 @@ impl PageServerNode { const RETRIES: i8 = 15; for retries in 1..RETRIES { match self.check_status() { - Ok(_) => { + Ok(()) => { println!("\nPageserver started"); return Ok(()); } @@ -254,21 +255,18 @@ impl PageServerNode { if retries == 5 { println!() // put a line break after dots for second message } - println!( - "Pageserver not responding yet, err {} retrying ({})...", - err, retries - ); + println!("Pageserver not responding yet, err {err} retrying ({retries})..."); } } PageserverHttpError::Response(msg) => { - bail!("pageserver failed to start: {} ", msg) + bail!("pageserver failed to start: {msg} ") } } thread::sleep(Duration::from_secs(1)); } } } - bail!("pageserver failed to start in {} seconds", RETRIES); + bail!("pageserver failed to start in {RETRIES} seconds"); } /// @@ -298,15 +296,11 @@ impl PageServerNode { match kill(pid, sig) { Ok(_) => (), Err(Errno::ESRCH) => { - println!( - "Pageserver with pid {} does not exist, but a PID file was found", - pid - ); + println!("Pageserver with pid {pid} does not exist, but a PID file was found"); return Ok(()); } Err(err) => bail!( - "Failed to send signal to pageserver with pid {}: {}", - pid, + "Failed to send signal to pageserver with pid {pid}: {}", err.desc() ), } @@ -335,13 +329,13 @@ impl PageServerNode { thread::sleep(Duration::from_millis(100)); } - bail!("Failed to stop pageserver with pid {}", pid); + bail!("Failed to stop pageserver with pid {pid}"); } pub fn page_server_psql(&self, sql: &str) -> Vec { let mut client = self.pg_connection_config.connect(NoTls).unwrap(); - println!("Pageserver query: '{}'", sql); + println!("Pageserver query: '{sql}'"); client.simple_query(sql).unwrap() } @@ -376,9 +370,8 @@ impl PageServerNode { &self, new_tenant_id: Option, settings: HashMap<&str, &str>, - ) -> anyhow::Result> { - let tenant_id_string = self - .http_request(Method::POST, format!("{}/tenant", self.http_base_url)) + ) -> anyhow::Result { + self.http_request(Method::POST, format!("{}/tenant", self.http_base_url)) .json(&TenantCreateRequest { new_tenant_id, checkpoint_distance: settings @@ -417,18 +410,16 @@ impl PageServerNode { }) .send()? .error_from_body()? 
- .json::>()?; - - tenant_id_string - .map(|id| { - id.parse().with_context(|| { - format!( - "Failed to parse tennat creation response as tenant id: {}", - id - ) + .json::>() + .with_context(|| { + format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}") + })? + .context("No tenant id was found in the tenant creation response") + .and_then(|tenant_id_string| { + tenant_id_string.parse().with_context(|| { + format!("Failed to parse response string as tenant id: '{tenant_id_string}'") }) }) - .transpose() } pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> { @@ -499,22 +490,27 @@ impl PageServerNode { new_timeline_id: Option, ancestor_start_lsn: Option, ancestor_timeline_id: Option, - ) -> anyhow::Result> { - let timeline_info_response = self - .http_request( - Method::POST, - format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), + ) -> anyhow::Result { + self.http_request( + Method::POST, + format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), + ) + .json(&TimelineCreateRequest { + new_timeline_id, + ancestor_start_lsn, + ancestor_timeline_id, + }) + .send()? + .error_from_body()? + .json::>() + .with_context(|| { + format!("Failed to parse timeline creation response for tenant id: {tenant_id}") + })? + .with_context(|| { + format!( + "No timeline id was found in the timeline creation response for tenant {tenant_id}" ) - .json(&TimelineCreateRequest { - new_timeline_id, - ancestor_start_lsn, - ancestor_timeline_id, - }) - .send()? - .error_from_body()? - .json::>()?; - - Ok(timeline_info_response) + }) } /// Import a basebackup prepared using either: diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh deleted file mode 100755 index 75dbdaed7a16..000000000000 --- a/docker-entrypoint.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh -set -eux - -pageserver_id_param="${NODE_ID:-10}" - -broker_endpoints_param="${BROKER_ENDPOINT:-absent}" -if [ "$broker_endpoints_param" != "absent" ]; then - broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']" -else - broker_endpoints_param='' -fi - -remote_storage_param="${REMOTE_STORAGE:-}" - -if [ "$1" = 'pageserver' ]; then - if [ ! -d "/data/tenants" ]; then - echo "Initializing pageserver data directory" - pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=${pageserver_id_param}" $broker_endpoints_param $remote_storage_param - fi - echo "Staring pageserver at 0.0.0.0:6400" - pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data -else - "$@" -fi diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index c4dd52e18341..78a465539a49 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -501,10 +501,10 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { // default_tenantid was generated by the `env.init()` call above let initial_tenant_id = env.default_tenant_id.unwrap(); - // Call 'pageserver init'. + // Initialize pageserver, create initial tenant and timeline. 
let pageserver = PageServerNode::from_env(&env); let initial_timeline_id = pageserver - .init( + .initialize( Some(initial_tenant_id), initial_timeline_id_arg, &pageserver_config_overrides(init_match), @@ -551,25 +551,15 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an .values_of("config") .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) .unwrap_or_default(); - let new_tenant_id = pageserver - .tenant_create(initial_tenant_id, tenant_conf)? - .ok_or_else(|| { - anyhow!("Tenant with id {:?} was already created", initial_tenant_id) - })?; - println!( - "tenant {} successfully created on the pageserver", - new_tenant_id - ); + let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?; + println!("tenant {new_tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant let new_timeline_id = parse_timeline_id(create_match)?; - let timeline = pageserver - .timeline_create(new_tenant_id, new_timeline_id, None, None)? - .context(format!( - "Failed to create initial timeline for tenant {new_tenant_id}" - ))?; - let new_timeline_id = timeline.timeline_id; - let last_record_lsn = timeline + let timeline_info = + pageserver.timeline_create(new_tenant_id, new_timeline_id, None, None)?; + let new_timeline_id = timeline_info.timeline_id; + let last_record_lsn = timeline_info .local .context(format!("Failed to get last record LSN: no local timeline info for timeline {new_timeline_id}"))? .last_record_lsn; @@ -616,20 +606,18 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let new_branch_name = create_match .value_of("branch-name") .ok_or_else(|| anyhow!("No branch name provided"))?; - let timeline = pageserver - .timeline_create(tenant_id, None, None, None)? - .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; - let new_timeline_id = timeline.timeline_id; + let timeline_info = pageserver.timeline_create(tenant_id, None, None, None)?; + let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline + let last_record_lsn = timeline_info .local .expect("no local timeline info") .last_record_lsn; env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( - "Created timeline '{}' at Lsn {} for tenant: {}", - timeline.timeline_id, last_record_lsn, tenant_id, + "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}", + timeline_info.timeline_id ); } Some(("import", import_match)) => { @@ -680,10 +668,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let ancestor_timeline_id = env .get_branch_timeline_id(ancestor_branch_name, tenant_id) .ok_or_else(|| { - anyhow!( - "Found no timeline id for branch name '{}'", - ancestor_branch_name - ) + anyhow!("Found no timeline id for branch name '{ancestor_branch_name}'") })?; let start_lsn = branch_match @@ -691,12 +676,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - .map(Lsn::from_str) .transpose() .context("Failed to parse ancestor start Lsn from the request")?; - let timeline = pageserver - .timeline_create(tenant_id, None, start_lsn, Some(ancestor_timeline_id))? 
- .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; - let new_timeline_id = timeline.timeline_id; + let timeline_info = pageserver.timeline_create( + tenant_id, + None, + start_lsn, + Some(ancestor_timeline_id), + )?; + let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline + let last_record_lsn = timeline_info .local .expect("no local timeline info") .last_record_lsn; @@ -704,11 +692,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( - "Created timeline '{}' at Lsn {} for tenant: {}. Ancestor timeline: '{}'", - timeline.timeline_id, last_record_lsn, tenant_id, ancestor_branch_name, + "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}. Ancestor timeline: '{ancestor_branch_name}'", + timeline_info.timeline_id ); } - Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), + Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{sub_name}'"), None => bail!("no tenant subcommand provided"), } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index b53996441490..1a13147f42d0 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,6 +1,6 @@ //! Main entry point for the Page Server executable. -use std::{env, path::Path, str::FromStr}; +use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use tracing::*; use anyhow::{bail, Context, Result}; @@ -13,7 +13,7 @@ use pageserver::{ config::{defaults::*, PageServerConf}, http, page_cache, page_service, profiling, tenant_mgr, thread_mgr, thread_mgr::ThreadKind, - timelines, virtual_file, LOG_FILE_NAME, + virtual_file, LOG_FILE_NAME, }; use utils::{ auth::JwtAuth, @@ -24,7 +24,6 @@ use utils::{ shutdown::exit_now, signals::{self, Signal}, tcp_listener, - zid::{ZTenantId, ZTimelineId}, }; project_git_version!(GIT_VERSION); @@ -42,6 +41,7 @@ fn main() -> anyhow::Result<()> { .about("Materializes WAL stream to pages and serves them to the postgres") .version(&*version()) .arg( + Arg::new("daemonize") .short('d') .long("daemonize") @@ -52,7 +52,7 @@ fn main() -> anyhow::Result<()> { Arg::new("init") .long("init") .takes_value(false) - .help("Initialize pageserver service: creates an initial config, tenant and timeline, if specified"), + .help("Initialize pageserver with all given config overrides"), ) .arg( Arg::new("workdir") @@ -61,20 +61,6 @@ fn main() -> anyhow::Result<()> { .takes_value(true) .help("Working directory for the pageserver"), ) - .arg( - Arg::new("create-tenant") - .long("create-tenant") - .takes_value(true) - .help("Create tenant during init") - .requires("init"), - ) - .arg( - Arg::new("initial-timeline-id") - .long("initial-timeline-id") - .takes_value(true) - .help("Use a specific timeline id during init and tenant creation") - .requires("create-tenant"), - ) // See `settings.md` for more details on the extra configuration patameters pageserver can process .arg( Arg::new("config-override") @@ -85,6 +71,9 @@ fn main() -> anyhow::Result<()> { .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). 
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), ) + .arg(Arg::new("update-config").long("update-config").takes_value(false).help( + "Update the config file when started", + )) .arg( Arg::new("enabled-features") .long("enabled-features") @@ -110,18 +99,6 @@ fn main() -> anyhow::Result<()> { .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?; let cfg_file_path = workdir.join("pageserver.toml"); - let init = arg_matches.is_present("init"); - let create_tenant = arg_matches - .value_of("create-tenant") - .map(ZTenantId::from_str) - .transpose() - .context("Failed to parse tenant id from the arguments")?; - let initial_timeline_id = arg_matches - .value_of("initial-timeline-id") - .map(ZTimelineId::from_str) - .transpose() - .context("Failed to parse timeline id from the arguments")?; - // Set CWD to workdir for non-daemon modes env::set_current_dir(&workdir).with_context(|| { format!( @@ -131,30 +108,86 @@ fn main() -> anyhow::Result<()> { })?; let daemonize = arg_matches.is_present("daemonize"); - if init && daemonize { - bail!("--daemonize cannot be used with --init") + + let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? { + ControlFlow::Continue(conf) => conf, + ControlFlow::Break(()) => { + info!("Pageserver config init successful"); + return Ok(()); + } + }; + + let tenants_path = conf.tenants_path(); + if !tenants_path.exists() { + utils::crashsafe_dir::create_dir_all(conf.tenants_path()).with_context(|| { + format!( + "Failed to create tenants root dir at '{}'", + tenants_path.display() + ) + })?; } - let mut toml = if init { - // We're initializing the repo, so there's no config file yet - DEFAULT_CONFIG_FILE - .parse::() - .context("could not parse built-in config file")? - } else { + // Initialize up failpoints support + let scenario = FailScenario::setup(); + + // Basic initialization of things that don't change after startup + virtual_file::init(conf.max_file_descriptors); + page_cache::init(conf.page_cache_size); + + start_pageserver(conf, daemonize).context("Failed to start pageserver")?; + + scenario.teardown(); + Ok(()) +} + +fn initialize_config( + cfg_file_path: &Path, + arg_matches: clap::ArgMatches, + workdir: &Path, +) -> anyhow::Result> { + let init = arg_matches.is_present("init"); + let update_config = init || arg_matches.is_present("update-config"); + + let (mut toml, config_file_exists) = if cfg_file_path.is_file() { + if init { + anyhow::bail!( + "Config file '{}' already exists, cannot init it, use --update-config to update it", + cfg_file_path.display() + ); + } // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(&cfg_file_path) - .with_context(|| format!("No pageserver config at '{}'", cfg_file_path.display()))?; - cfg_file_contents - .parse::() - .with_context(|| { - format!( - "Failed to read '{}' as pageserver config", - cfg_file_path.display() - ) - })? 
+ let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| { + format!( + "Failed to read pageserver config at '{}'", + cfg_file_path.display() + ) + })?; + ( + cfg_file_contents + .parse::() + .with_context(|| { + format!( + "Failed to parse '{}' as pageserver config", + cfg_file_path.display() + ) + })?, + true, + ) + } else if cfg_file_path.exists() { + anyhow::bail!( + "Config file '{}' exists but is not a regular file", + cfg_file_path.display() + ); + } else { + // We're initializing the repo, so there's no config file yet + ( + DEFAULT_CONFIG_FILE + .parse::() + .context("could not parse built-in config file")?, + false, + ) }; - // Process any extra options given with -c if let Some(values) = arg_matches.values_of("config-override") { for option_line in values { let doc = toml_edit::Document::from_str(option_line).with_context(|| { @@ -165,49 +198,38 @@ fn main() -> anyhow::Result<()> { })?; for (key, item) in doc.iter() { - if key == "id" { - anyhow::ensure!( - init, - "node id can only be set during pageserver init and cannot be overridden" - ); + if config_file_exists && update_config && key == "id" && toml.contains_key(key) { + anyhow::bail!("Pageserver config file exists at '{}' and has node id already, it cannot be overridden", cfg_file_path.display()); } toml.insert(key, item.clone()); } } } - trace!("Resulting toml: {}", toml); - let conf = PageServerConf::parse_and_validate(&toml, &workdir) - .context("Failed to parse pageserver configuration")?; - - // The configuration is all set up now. Turn it into a 'static - // that can be freely stored in structs and passed across threads - // as a ref. - let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - // Initialize up failpoints support - let scenario = FailScenario::setup(); + debug!("Resulting toml: {toml}"); + let conf = PageServerConf::parse_and_validate(&toml, workdir) + .context("Failed to parse pageserver configuration")?; - // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors); - page_cache::init(conf.page_cache_size); + if update_config { + info!("Writing pageserver config to '{}'", cfg_file_path.display()); - // Create repo and exit if init was requested - if init { - timelines::init_pageserver(conf, create_tenant, initial_timeline_id) - .context("Failed to init pageserver")?; - // write the config file std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| { format!( - "Failed to initialize pageserver config at '{}'", + "Failed to write pageserver config to '{}'", cfg_file_path.display() ) })?; - } else { - start_pageserver(conf, daemonize).context("Failed to start pageserver")?; + info!( + "Config successfully written to '{}'", + cfg_file_path.display() + ) } - scenario.teardown(); - Ok(()) + Ok(if init { + ControlFlow::Break(()) + } else { + ControlFlow::Continue(Box::leak(Box::new(conf))) + }) } fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index d90cd7371a0f..64f1caa542c3 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -9,7 +9,6 @@ use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; use crate::thread_mgr::ThreadKind; -use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; use crate::{thread_mgr, timelines, walreceiver}; 
use anyhow::Context; @@ -284,10 +283,8 @@ pub fn create_tenant_repository( conf, tenant_conf, tenant_id, - CreateRepo::Real { - wal_redo_manager, - remote_index, - }, + wal_redo_manager, + remote_index, )?; v.insert(Tenant { state: TenantState::Idle, diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index c5b938c5fe71..ed5975d3bde4 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -13,17 +13,17 @@ use std::{ use tracing::*; use utils::{ - crashsafe_dir, logging, + crashsafe_dir, lsn::Lsn, zid::{ZTenantId, ZTimelineId}, }; +use crate::import_datadir; use crate::tenant_mgr; use crate::{ config::PageServerConf, repository::Repository, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, }; -use crate::{import_datadir, LOG_FILE_NAME}; use crate::{ layered_repository::{LayeredRepository, LayeredTimeline}, walredo::WalRedoManager, @@ -36,69 +36,13 @@ pub struct PointInTime { pub lsn: Lsn, } -pub fn init_pageserver( - conf: &'static PageServerConf, - create_tenant: Option, - initial_timeline_id: Option, -) -> anyhow::Result<()> { - // Initialize logger - // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages - let _log_file = logging::init(LOG_FILE_NAME, true)?; - - crashsafe_dir::create_dir_all(conf.tenants_path())?; - - if let Some(tenant_id) = create_tenant { - println!("initializing tenantid {}", tenant_id); - let repo = create_repo(conf, TenantConfOpt::default(), tenant_id, CreateRepo::Dummy) - .context("failed to create repo")?; - let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); - bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref()) - .context("failed to create initial timeline")?; - println!("initial timeline {} created", new_timeline_id) - } else if initial_timeline_id.is_some() { - println!("Ignoring initial timeline parameter, due to no tenant id to create given"); - } - - println!("pageserver init succeeded"); - Ok(()) -} - -pub enum CreateRepo { - Real { - wal_redo_manager: Arc, - remote_index: RemoteIndex, - }, - Dummy, -} - pub fn create_repo( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, - create_repo: CreateRepo, + wal_redo_manager: Arc, + remote_index: RemoteIndex, ) -> Result> { - let (wal_redo_manager, remote_index) = match create_repo { - CreateRepo::Real { - wal_redo_manager, - remote_index, - } => (wal_redo_manager, remote_index), - CreateRepo::Dummy => { - // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo - // process during repository initialization. - // - // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched - // initdb in the background, and it kept running even after the "zenith init" had exited. - // In tests, we started the page server immediately after that, so that initdb was still - // running in the background, and we failed to run initdb again in the same directory. This - // has been solved for the rapid init+start case now, but the general race condition remains - // if you restart the server quickly. The WAL redo manager doesn't use a separate thread - // anymore, but I think that could still happen. 
- let wal_redo_manager = Arc::new(crate::walredo::DummyRedoManager {}); - - (wal_redo_manager as _, RemoteIndex::default()) - } - }; - let repo_dir = conf.tenant_path(&tenant_id); ensure!( !repo_dir.exists(), diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 85f970a941f8..57817dbc9c1e 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -82,24 +82,6 @@ pub trait WalRedoManager: Send + Sync { ) -> Result; } -/// -/// A dummy WAL Redo Manager implementation that doesn't allow replaying -/// anything. Currently used during bootstrapping (zenith init), to create -/// a Repository object without launching the real WAL redo process. -/// -pub struct DummyRedoManager {} -impl crate::walredo::WalRedoManager for DummyRedoManager { - fn request_redo( - &self, - _key: Key, - _lsn: Lsn, - _base_img: Option, - _records: Vec<(Lsn, ZenithWalRecord)>, - ) -> Result { - Err(WalRedoError::InvalidState) - } -} - // Metrics collected on WAL redo operations // // We collect the time spent in actual WAL redo ('redo'), and time waiting diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 51df41699a72..710b220ae8d0 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,7 +1,11 @@ from typing import Optional from uuid import uuid4, UUID import pytest +import pathlib +import os +import subprocess from fixtures.utils import lsn_from_hex +from fixtures.log_helper import log from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -9,16 +13,43 @@ NeonPageserverHttpClient, NeonPageserverApiException, wait_until, + neon_binpath, + pg_distrib_dir, ) -# test that we cannot override node id -def test_pageserver_init_node_id(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init() - with pytest.raises( - Exception, - match="node id can only be set during pageserver init and cannot be overridden"): - env.pageserver.start(overrides=['--pageserver-config-override=id=10']) +# test that we cannot override node id after init +def test_pageserver_init_node_id(neon_simple_env: NeonEnv): + repo_dir = neon_simple_env.repo_dir + pageserver_config = repo_dir / 'pageserver.toml' + pageserver_bin = pathlib.Path(neon_binpath) / 'pageserver' + run_pageserver = lambda args: subprocess.run([str(pageserver_bin), '-D', str(repo_dir), *args], + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + # remove initial config + pageserver_config.unlink() + + bad_init = run_pageserver(['--init', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) + assert bad_init.returncode == 1, 'pageserver should not be able to init new config without the node id' + assert "missing id" in bad_init.stderr + assert not pageserver_config.exists(), 'config file should not be created after init error' + + completed_init = run_pageserver( + ['--init', '-c', 'id = 12345', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) + assert completed_init.returncode == 0, 'pageserver should be able to create a new config with the node id given' + assert pageserver_config.exists(), 'config file should be created successfully' + + bad_reinit = run_pageserver( + ['--init', '-c', 'id = 12345', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) + assert bad_reinit.returncode == 1, 'pageserver should not be able to init new config without the node id' + assert "already exists, cannot init it" in bad_reinit.stderr + + bad_update = run_pageserver(['--update-config', '-c', 
'id = 3']) + assert bad_update.returncode == 1, 'pageserver should not allow updating node id' + assert "has node id already, it cannot be overridden" in bad_update.stderr def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 176ca740fe68..eb65e2e3b523 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -44,30 +44,22 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, cannot use NeonPageserver yet because it depends on neon cli which currently lacks support for multiple pageservers """ + # actually run new pageserver cmd = [ str(pageserver_bin), - '--init', '--workdir', str(new_pageserver_dir), + '--daemonize', + '--update-config', f"-c listen_pg_addr='localhost:{pg_port}'", f"-c listen_http_addr='localhost:{http_port}'", f"-c pg_distrib_dir='{pg_distrib_dir}'", f"-c id=2", f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", ] - if broker is not None: cmd.append(f"-c broker_endpoints=['{broker.client_url()}']", ) - subprocess.check_output(cmd, text=True) - - # actually run new pageserver - cmd = [ - str(pageserver_bin), - '--workdir', - str(new_pageserver_dir), - '--daemonize', - ] log.info("starting new pageserver %s", cmd) out = subprocess.check_output(cmd, text=True) log.info("started new pageserver %s", out)
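
Note (not a part of the patch itself): the reworked CLI is exercised directly by the new Python tests above. As a minimal sketch of the intended two-step flow, where `--init` now only writes `pageserver.toml` from the supplied `-c` overrides and exits, and `--update-config` persists further overrides on a normal start, the standalone script below mirrors the tests' invocation style. The binary path, workdir and `pg_distrib_dir` values are placeholders for illustration, not values defined by this patch.

import subprocess

# Placeholder locations; adjust for the actual installation (not defined by the patch).
PAGESERVER_BIN = "/usr/local/bin/pageserver"
WORKDIR = "/data"
PG_DISTRIB_DIR = "/usr/local"


def run_pageserver(*args: str) -> subprocess.CompletedProcess:
    """Invoke the pageserver binary against WORKDIR, capturing its output."""
    return subprocess.run(
        [PAGESERVER_BIN, "-D", WORKDIR, *args],
        check=False,
        text=True,
        capture_output=True,
    )


# Step 1: '--init' only writes WORKDIR/pageserver.toml from the given '-c' overrides
# and then exits. It fails if the config file already exists or if no node id is given.
init = run_pageserver("--init", "-c", "id = 10", "-c", f"pg_distrib_dir='{PG_DISTRIB_DIR}'")
assert init.returncode == 0, init.stderr

# Step 2: a regular start reads the config written above; '--update-config' additionally
# persists any new '-c' overrides back into pageserver.toml. Changing the node id this
# way is rejected. A real deployment typically needs more settings here (for example
# broker endpoints) than this sketch passes.
start = run_pageserver(
    "--daemonize",
    "--update-config",
    "-c", "listen_pg_addr='0.0.0.0:6400'",
    "-c", "listen_http_addr='0.0.0.0:9898'",
)
print(start.stdout or start.stderr)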