diff --git a/.config/nextest.toml b/.config/nextest.toml index a9398e4ab06a..affdc16f31ed 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -1,2 +1,2 @@ [profile.default] -slow-timeout = { period = "20s", terminate-after = 3 } +slow-timeout = { period = "60s", terminate-after = 3 } diff --git a/.github/actionlint.yml b/.github/actionlint.yml index cb36e2eee63f..942861ecd803 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -1,11 +1,9 @@ self-hosted-runner: labels: - arm64 - - dev - gen3 - large - # Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged. - - macos-14 + - large-arm64 - small - us-east-2 config-variables: diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index c527cef1ac2b..bdf00bcaae1b 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -39,7 +39,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }} + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} env: IMAGE_TAG: ${{ inputs.image-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 606564f2095b..21e7a56670c2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -236,27 +236,6 @@ jobs: submodules: true fetch-depth: 1 - - name: Check Postgres submodules revision - shell: bash -euo pipefail {0} - run: | - # This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally). - # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603 - - FAILED=false - for postgres in postgres-v14 postgres-v15 postgres-v16; do - expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"') - actual=$(git rev-parse "HEAD:vendor/${postgres}") - if [ "${expected}" != "${actual}" ]; then - echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'" - FAILED=true - fi - done - - if [ "${FAILED}" = "true" ]; then - echo >&2 "Please update vendor/revisions.json if these changes are intentional" - exit 1 - fi - - name: Set pg 14 revision for caching id: pg_v14_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT @@ -362,6 +341,9 @@ jobs: env: NEXTEST_RETRIES: 3 run: | + #nextest does not yet support running doctests + cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + for io_engine in std-fs tokio-epoll-uring ; do NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES done diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 5a2f9d6645ed..fdb03963fbf5 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -136,7 +136,7 @@ jobs: check-linux-arm-build: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, dev, arm64 ] + runs-on: [ self-hosted, large-arm64 ] env: # Use release build only, to have less debug info around @@ -232,20 +232,20 @@ jobs: - name: Run cargo build run: | - mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests + mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - name: Run cargo test env: NEXTEST_RETRIES: 3 
run: | - cargo nextest run $CARGO_FEATURES + cargo nextest run $CARGO_FEATURES -j$(nproc) # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_s3 + cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc) # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -255,12 +255,12 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo nextest run --package remote_storage --test test_real_azure + cargo nextest run --package remote_storage --test test_real_azure -j$(nproc) check-codestyle-rust-arm: needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, dev, arm64 ] + runs-on: [ self-hosted, large-arm64 ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -269,6 +269,11 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + steps: - name: Fix git ownership run: | @@ -305,31 +310,35 @@ jobs: exit 1 fi echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + if: matrix.build_type == 'debug' run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - name: Run cargo clippy (release) + if: matrix.build_type == 'release' run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS - name: Check documentation generation - run: cargo doc --workspace --no-deps --document-private-items + if: matrix.build_type == 'release' + run: cargo doc --workspace --no-deps --document-private-items -j$(nproc) env: RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - name: Check formatting - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: cargo fmt --all -- --check # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - name: Check rust dependencies - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: | cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack # https://github.com/EmbarkStudios/cargo-deny - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: cargo deny check gather-rust-build-stats: @@ -338,7 +347,7 @@ jobs: contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' - runs-on: [ self-hosted, gen3, large ] + runs-on: [ self-hosted, large ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -369,7 +378,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings + run: cargo build --all --release 
--timings -j$(nproc) - name: Upload the build stats id: upload-stats diff --git a/Cargo.lock b/Cargo.lock index 8438dad41b8c..6ce7180d67db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.9" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", @@ -284,9 +284,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.1.4" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b30c39ebe61f75d1b3785362b1586b41991873c9ab3e317a9181c246fb71d82" +checksum = "baaa0be6ee7d90b775ae6ccb6d2ba182b91219ec2001f92338773a094246af1d" dependencies = [ "aws-credential-types", "aws-runtime", @@ -309,14 +309,15 @@ dependencies = [ "time", "tokio", "tracing", + "url", "zeroize", ] [[package]] name = "aws-credential-types" -version = "1.1.8" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8" +checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -326,9 +327,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.1.8" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8" +checksum = "785da4a15e7b166b505fd577e4560c7a7cd8fbdf842eb1336cbcbf8944ce56f1" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -373,10 +374,11 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.14.0" +version = "1.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "951f7730f51a2155c711c85c79f337fbc02a577fa99d2a0a8059acfce5392113" +checksum = "7bc5ce518d4b8d16e0408de7bdf1b3097cec61a7daa979750a208f8d9934386d" dependencies = [ + "ahash", "aws-credential-types", "aws-runtime", "aws-sigv4", @@ -391,20 +393,25 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", + "fastrand 2.0.0", + "hex", + "hmac", "http 0.2.9", "http-body 0.4.5", + "lru", "once_cell", "percent-encoding", "regex-lite", + "sha2", "tracing", "url", ] [[package]] name = "aws-sdk-sso" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f486420a66caad72635bc2ce0ff6581646e0d32df02aa39dc983bfe794955a5b" +checksum = "ca3d6c4cba4e009391b72b0fcf12aff04ea3c9c3aa2ecaafa330326a8bd7e601" dependencies = [ "aws-credential-types", "aws-runtime", @@ -424,9 +431,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39ddccf01d82fce9b4a15c8ae8608211ee7db8ed13a70b514bbfe41df3d24841" +checksum = "73400dc239d14f63d932f4ca7b55af5e9ef1f857f7d70655249ccc287adb2570" dependencies = [ "aws-credential-types", "aws-runtime", @@ -446,9 +453,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.12.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a591f8c7e6a621a501b2b5d2e88e1697fcb6274264523a6ad4d5959889a41ce" +checksum 
= "10f8858308af76fba3e5ffcf1bb56af5471574d2bdfaf0159470c25bc2f760e5" dependencies = [ "aws-credential-types", "aws-runtime", @@ -469,9 +476,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263" +checksum = "58b56f1cbe6fd4d0c2573df72868f20ab1c125ca9c9dbce17927a463433a2e57" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -498,9 +505,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.1.8" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" dependencies = [ "futures-util", "pin-project-lite", @@ -509,9 +516,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be2acd1b9c6ae5859999250ed5a62423aedc5cf69045b844432de15fa2f31f2b" +checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -541,9 +548,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.7" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9" +checksum = "4a7de001a1b9a25601016d8057ea16e31a45fdca3751304c8edf4ad72e706c08" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -581,9 +588,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.1.8" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01" +checksum = "c9ac79e9f3a4d576f3cd4a470a0275b138d9e7b11b1cd514a6858ae0a79dd5bb" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -594,6 +601,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.9", "http-body 0.4.5", + "http-body 1.0.0", "hyper 0.14.26", "hyper-rustls 0.24.0", "once_cell", @@ -606,9 +614,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.2.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5" +checksum = "04ec42c2f5c0e7796a2848dde4d9f3bf8ce12ccbb3d5aa40c52fa0cdd61a1c47" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -623,16 +631,19 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729" +checksum = "baf98d97bba6ddaba180f1b1147e202d8fe04940403a95a3f826c790f931bbd1" dependencies = [ "base64-simd", "bytes", "bytes-utils", "futures-core", "http 0.2.9", + "http 1.1.0", "http-body 0.4.5", + "http-body 1.0.0", + "http-body-util", "itoa", "num-integer", "pin-project-lite", @@ -646,18 +657,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.7" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9" +checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55" 
dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.1.8" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40" +checksum = "5a43b56df2c529fe44cb4d92bd64d0479883fb9608ff62daede4df5405381814" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -1348,6 +1359,7 @@ dependencies = [ "tokio-postgres", "tokio-util", "toml", + "toml_edit", "tracing", "url", "utils", @@ -2934,6 +2946,15 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lru" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc" +dependencies = [ + "hashbrown 0.14.0", +] + [[package]] name = "match_cfg" version = "0.1.0" @@ -4371,6 +4392,7 @@ dependencies = [ "hyper 1.2.0", "hyper-tungstenite", "hyper-util", + "indexmap 2.0.1", "ipnet", "itertools", "lasso", diff --git a/Cargo.toml b/Cargo.toml index a6d406dc2f6e..17f30a1327d0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,14 +52,14 @@ azure_storage_blobs = "0.19" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" -aws-config = { version = "1.1.4", default-features = false, features=["rustls"] } -aws-sdk-s3 = "1.14" +aws-config = { version = "1.3", default-features = false, features=["rustls"] } +aws-sdk-s3 = "1.26" aws-sdk-iam = "1.15.0" -aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] } -aws-smithy-types = "1.1.4" -aws-credential-types = "1.1.4" -aws-sigv4 = { version = "1.2.0", features = ["sign-http"] } -aws-types = "1.1.7" +aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] } +aws-smithy-types = "1.1.9" +aws-credential-types = "1.2.0" +aws-sigv4 = { version = "1.2.1", features = ["sign-http"] } +aws-types = "1.2.0" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" @@ -99,6 +99,7 @@ humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" hyper-tungstenite = "0.13.0" +indexmap = "2" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" diff --git a/Makefile b/Makefile index 5e2b3c43672c..dcbfdbcbc15a 100644 --- a/Makefile +++ b/Makefile @@ -81,11 +81,14 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status: echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \ exit 1; } mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* - (cd $(POSTGRES_INSTALL_DIR)/build/$* && \ - env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \ + + VERSION=$*; \ + EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ + (cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \ + env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ CFLAGS='$(PG_CFLAGS)' \ - $(PG_CONFIGURE_OPTS) \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log) + $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log) # nicer alias to run 'configure' # Note: I've been unable to use templates for this part of our configuration. 
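The removed "Check Postgres submodules revision" step earlier in this diff notes that it is eventually meant to be replaced by a regression test (https://github.com/neondatabase/neon/pull/4603), and the Makefile change above now bakes each submodule's commit into the server version string via `--with-extra-version`. Purely as a rough, hypothetical sketch — not the test from that PR — such a check could mirror the removed bash logic in Rust, assuming it runs from the repository root, that `vendor/revisions.json` is a flat map of submodule name to commit SHA, and that `serde_json` is available:

```rust
// Hypothetical sketch of a revisions check, mirroring the removed CI step:
// compare the SHAs pinned in vendor/revisions.json with the checked-out submodules.
use std::collections::HashMap;
use std::process::Command;

#[test]
fn postgres_submodules_match_revisions_json() {
    let json = std::fs::read_to_string("vendor/revisions.json")
        .expect("read vendor/revisions.json");
    // Assumed layout: revisions.json maps e.g. "postgres-v16" to the expected commit SHA.
    let expected: HashMap<String, String> =
        serde_json::from_str(&json).expect("parse vendor/revisions.json");

    for (name, expected_sha) in &expected {
        // Same query the removed bash step ran: `git rev-parse HEAD:vendor/<name>`.
        let out = Command::new("git")
            .arg("rev-parse")
            .arg(format!("HEAD:vendor/{name}"))
            .output()
            .expect("run git rev-parse");
        let actual_sha = String::from_utf8_lossy(&out.stdout).trim().to_string();
        assert_eq!(
            &actual_sha, expected_sha,
            "{name}: please update vendor/revisions.json if this change is intentional"
        );
    }
}
```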
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 67c5250376c4..9295f091d58e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -51,6 +51,7 @@ use tracing::{error, info, warn}; use url::Url; use compute_api::responses::ComputeStatus; +use compute_api::spec::ComputeSpec; use compute_tools::compute::{ forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, @@ -69,6 +70,34 @@ use compute_tools::swap::resize_swap; const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { + let (build_tag, clap_args) = init()?; + + let (pg_handle, start_pg_result) = { + // Enter startup tracing context + let _startup_context_guard = startup_context_from_env(); + + let cli_args = process_cli(&clap_args)?; + + let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?; + + let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?; + + start_postgres(&clap_args, wait_spec_result)? + + // Startup is finished, exit the startup tracing span + }; + + // PostgreSQL is now running, if startup was successful. Wait until it exits. + let wait_pg_result = wait_postgres(pg_handle)?; + + let delay_exit = cleanup_after_postgres_exit(start_pg_result)?; + + maybe_delay_exit(delay_exit); + + deinit_and_exit(wait_pg_result); +} + +fn init() -> Result<(String, clap::ArgMatches)> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; @@ -83,9 +112,15 @@ fn main() -> Result<()> { .to_string(); info!("build_tag: {build_tag}"); - let matches = cli().get_matches(); - let pgbin_default = String::from("postgres"); - let pgbin = matches.get_one::("pgbin").unwrap_or(&pgbin_default); + Ok((build_tag, cli().get_matches())) +} + +fn process_cli(matches: &clap::ArgMatches) -> Result { + let pgbin_default = "postgres"; + let pgbin = matches + .get_one::("pgbin") + .map(|s| s.as_str()) + .unwrap_or(pgbin_default); let ext_remote_storage = matches .get_one::("remote-ext-config") @@ -113,6 +148,30 @@ fn main() -> Result<()> { let spec_path = matches.get_one::("spec-path"); let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); + Ok(ProcessCliResult { + connstr, + pgdata, + pgbin, + ext_remote_storage, + http_port, + spec_json, + spec_path, + resize_swap_on_bind, + }) +} + +struct ProcessCliResult<'clap> { + connstr: &'clap str, + pgdata: &'clap str, + pgbin: &'clap str, + ext_remote_storage: Option<&'clap str>, + http_port: u16, + spec_json: Option<&'clap String>, + spec_path: Option<&'clap String>, + resize_swap_on_bind: bool, +} + +fn startup_context_from_env() -> Option { // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current // tracing context. @@ -149,7 +208,7 @@ fn main() -> Result<()> { if let Ok(val) = std::env::var("TRACESTATE") { startup_tracing_carrier.insert("tracestate".to_string(), val); } - let startup_context_guard = if !startup_tracing_carrier.is_empty() { + if !startup_tracing_carrier.is_empty() { use opentelemetry::propagation::TextMapPropagator; use opentelemetry::sdk::propagation::TraceContextPropagator; let guard = TraceContextPropagator::new() @@ -159,8 +218,17 @@ fn main() -> Result<()> { Some(guard) } else { None - }; + } +} +fn try_spec_from_cli( + matches: &clap::ArgMatches, + ProcessCliResult { + spec_json, + spec_path, + .. 
+ }: &ProcessCliResult, +) -> Result { let compute_id = matches.get_one::("compute-id"); let control_plane_uri = matches.get_one::("control-plane-uri"); @@ -201,6 +269,34 @@ fn main() -> Result<()> { } }; + Ok(CliSpecParams { + spec, + live_config_allowed, + }) +} + +struct CliSpecParams { + /// If a spec was provided via CLI or file, the [`ComputeSpec`] + spec: Option, + live_config_allowed: bool, +} + +fn wait_spec( + build_tag: String, + ProcessCliResult { + connstr, + pgdata, + pgbin, + ext_remote_storage, + resize_swap_on_bind, + http_port, + .. + }: ProcessCliResult, + CliSpecParams { + spec, + live_config_allowed, + }: CliSpecParams, +) -> Result { let mut new_state = ComputeState::new(); let spec_set; @@ -239,8 +335,6 @@ fn main() -> Result<()> { let _http_handle = launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); - let extension_server_port: u16 = http_port; - if !spec_set { // No spec provided, hang waiting for it. info!("no compute spec provided, waiting"); @@ -269,6 +363,29 @@ fn main() -> Result<()> { state.start_time = now; } + Ok(WaitSpecResult { + compute, + http_port, + resize_swap_on_bind, + }) +} + +struct WaitSpecResult { + compute: Arc, + // passed through from ProcessCliResult + http_port: u16, + resize_swap_on_bind: bool, +} + +fn start_postgres( + // need to allow unused because `matches` is only used if target_os = "linux" + #[allow(unused_variables)] matches: &clap::ArgMatches, + WaitSpecResult { + compute, + http_port, + resize_swap_on_bind, + }: WaitSpecResult, +) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); state.status = ComputeStatus::Init; @@ -318,10 +435,10 @@ fn main() -> Result<()> { } } + let extension_server_port: u16 = http_port; + // Start Postgres let mut pg = None; - let mut exit_code = None; - if !prestartup_failed { pg = match compute.start_compute(extension_server_port) { Ok(pg) => Some(pg), @@ -376,7 +493,7 @@ fn main() -> Result<()> { // This token is used internally by the monitor to clean up all threads let token = CancellationToken::new(); - let vm_monitor = &rt.as_ref().map(|rt| { + let vm_monitor = rt.as_ref().map(|rt| { rt.spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { cgroup: cgroup.cloned(), @@ -389,12 +506,41 @@ fn main() -> Result<()> { } } + Ok(( + pg, + StartPostgresResult { + delay_exit, + compute, + #[cfg(target_os = "linux")] + rt, + #[cfg(target_os = "linux")] + token, + #[cfg(target_os = "linux")] + vm_monitor, + }, + )) +} + +type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>); + +struct StartPostgresResult { + delay_exit: bool, + // passed through from WaitSpecResult + compute: Arc, + + #[cfg(target_os = "linux")] + rt: Option, + #[cfg(target_os = "linux")] + token: tokio_util::sync::CancellationToken, + #[cfg(target_os = "linux")] + vm_monitor: Option>>, +} + +fn wait_postgres(pg: Option) -> Result { // Wait for the child Postgres process forever. In this state Ctrl+C will // propagate to Postgres and it will be shut down as well. 
+ let mut exit_code = None; if let Some((mut pg, logs_handle)) = pg { - // Startup is finished, exit the startup tracing span - drop(startup_context_guard); - let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); @@ -409,6 +555,25 @@ fn main() -> Result<()> { exit_code = ecode.code() } + Ok(WaitPostgresResult { exit_code }) +} + +struct WaitPostgresResult { + exit_code: Option, +} + +fn cleanup_after_postgres_exit( + StartPostgresResult { + mut delay_exit, + compute, + #[cfg(target_os = "linux")] + vm_monitor, + #[cfg(target_os = "linux")] + token, + #[cfg(target_os = "linux")] + rt, + }: StartPostgresResult, +) -> Result { // Terminate the vm_monitor so it releases the file watcher on // /sys/fs/cgroup/neon-postgres. // Note: the vm-monitor only runs on linux because it requires cgroups. @@ -450,13 +615,19 @@ fn main() -> Result<()> { error!("error while checking for core dumps: {err:?}"); } + Ok(delay_exit) +} + +fn maybe_delay_exit(delay_exit: bool) { // If launch failed, keep serving HTTP requests for a while, so the cloud // control plane can get the actual error. if delay_exit { info!("giving control plane 30s to collect the error before shutdown"); thread::sleep(Duration::from_secs(30)); } +} +fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. Shutting down OTEL tracing provider may // hang for quite some time, see, for example: diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 2ce041068e3c..e62f3b8a4780 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -28,6 +28,7 @@ serde_with.workspace = true tar.workspace = true thiserror.workspace = true toml.workspace = true +toml_edit.workspace = true tokio.workspace = true tokio-postgres.workspace = true tokio-util.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 14b83c125275..18e395e2b5a5 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -9,8 +9,11 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum}; use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; -use control_plane::local_env::{InitForceMode, LocalEnv}; -use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; +use control_plane::local_env::{ + InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf, + SafekeeperConf, +}; +use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; @@ -52,44 +55,6 @@ const DEFAULT_PG_VERSION: &str = "15"; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; -fn default_conf(num_pageservers: u16) -> String { - let mut template = format!( - r#" -# Default built-in configuration, defined in main.rs -control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}' - -[broker] -listen_addr = '{DEFAULT_BROKER_ADDR}' - -[[safekeepers]] -id = {DEFAULT_SAFEKEEPER_ID} -pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} -http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} - -"#, - ); - - for i in 0..num_pageservers { - let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); - let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; - let http_port = 
DEFAULT_PAGESERVER_HTTP_PORT + i; - - template += &format!( - r#" -[[pageservers]] -id = {pageserver_id} -listen_pg_addr = '127.0.0.1:{pg_port}' -listen_http_addr = '127.0.0.1:{http_port}' -pg_auth_type = '{trust_auth}' -http_auth_type = '{trust_auth}' -"#, - trust_auth = AuthType::Trust, - ) - } - - template -} - /// /// Timelines tree element used as a value in the HashMap. /// @@ -133,7 +98,7 @@ fn main() -> Result<()> { let subcommand_result = match sub_name { "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), - "start" => rt.block_on(handle_start_all(sub_args, &env)), + "start" => rt.block_on(handle_start_all(&env)), "stop" => rt.block_on(handle_stop_all(sub_args, &env)), "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)), @@ -152,7 +117,7 @@ fn main() -> Result<()> { }; match subcommand_result { - Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?, + Ok(Some(updated_env)) => updated_env.persist_config()?, Ok(None) => (), Err(e) => { eprintln!("command failed: {e:?}"); @@ -341,48 +306,65 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result anyhow::Result { - let num_pageservers = init_match - .get_one::("num-pageservers") - .expect("num-pageservers arg has a default"); - // Create config file - let toml_file: String = if let Some(config_path) = init_match.get_one::("config") { + let num_pageservers = init_match.get_one::("num-pageservers"); + + let force = init_match.get_one("force").expect("we set a default value"); + + // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`. + let init_conf: NeonLocalInitConf = if let Some(config_path) = + init_match.get_one::("config") + { + // User (likely the Python test suite) provided a description of the environment. + if num_pageservers.is_some() { + bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"); + } // load and parse the file - std::fs::read_to_string(config_path).with_context(|| { + let contents = std::fs::read_to_string(config_path).with_context(|| { format!( "Could not read configuration file '{}'", config_path.display() ) - })? + })?; + toml_edit::de::from_str(&contents)? 
} else { - // Built-in default config - default_conf(*num_pageservers) + // User (likely interactive) did not provide a description of the environment, give them the default + NeonLocalInitConf { + control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())), + broker: NeonBroker { + listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(), + }, + safekeepers: vec![SafekeeperConf { + id: DEFAULT_SAFEKEEPER_ID, + pg_port: DEFAULT_SAFEKEEPER_PG_PORT, + http_port: DEFAULT_SAFEKEEPER_HTTP_PORT, + ..Default::default() + }], + pageservers: (0..num_pageservers.copied().unwrap_or(1)) + .map(|i| { + let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); + let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; + let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; + NeonLocalInitPageserverConf { + id: pageserver_id, + listen_pg_addr: format!("127.0.0.1:{pg_port}"), + listen_http_addr: format!("127.0.0.1:{http_port}"), + pg_auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, + other: Default::default(), + } + }) + .collect(), + pg_distrib_dir: None, + neon_distrib_dir: None, + default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), + storage_controller: None, + control_plane_compute_hook_api: None, + } }; - let pg_version = init_match - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; - - let mut env = - LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; - let force = init_match.get_one("force").expect("we set a default value"); - env.init(pg_version, force) - .context("Failed to initialize neon repository")?; - - // Create remote storage location for default LocalFs remote storage - std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; - - // Initialize pageserver, create initial tenant and timeline. - for ps_conf in &env.pageservers { - PageServerNode::from_env(&env, ps_conf) - .initialize(&pageserver_config_overrides(init_match)) - .unwrap_or_else(|e| { - eprintln!("pageserver init failed: {e:?}"); - exit(1); - }); - } - - Ok(env) + LocalEnv::init(init_conf, force) + .context("materialize initial neon_local environment on disk")?; + Ok(LocalEnv::load_config().expect("freshly written config should be loadable")) } /// The default pageserver is the one where CLI tenant/timeline operations are sent by default. 
@@ -397,15 +379,6 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode { PageServerNode::from_env(env, ps_conf) } -fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { - init_match - .get_many::("pageserver-config-override") - .into_iter() - .flatten() - .map(String::as_str) - .collect() -} - async fn handle_tenant( tenant_match: &ArgMatches, env: &mut local_env::LocalEnv, @@ -837,6 +810,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .copied() .unwrap_or(false); + let allow_multiple = sub_args.get_flag("allow-multiple"); + let mode = match (lsn, hot_standby) { (Some(lsn), false) => ComputeMode::Static(lsn), (None, true) => ComputeMode::Replica, @@ -854,7 +829,9 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re _ => {} } - cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + if !allow_multiple { + cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + } cplane.new_endpoint( &endpoint_id, @@ -883,6 +860,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re let remote_ext_config = sub_args.get_one::("remote-ext-config"); + let allow_multiple = sub_args.get_flag("allow-multiple"); + // If --safekeepers argument is given, use only the listed safekeeper nodes. let safekeepers = if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { @@ -908,11 +887,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .cloned() .unwrap_or_default(); - cplane.check_conflicting_endpoints( - endpoint.mode, - endpoint.tenant_id, - endpoint.timeline_id, - )?; + if !allow_multiple { + cplane.check_conflicting_endpoints( + endpoint.mode, + endpoint.tenant_id, + endpoint.timeline_id, + )?; + } let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id { let conf = env.get_pageserver_conf(pageserver_id).unwrap(); @@ -1068,10 +1049,7 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result Result<()> { match sub_match.subcommand() { Some(("start", subcommand_args)) => { - if let Err(e) = get_pageserver(env, subcommand_args)? 
- .start(&pageserver_config_overrides(subcommand_args)) - .await - { + if let Err(e) = get_pageserver(env, subcommand_args)?.start().await { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -1097,10 +1075,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> exit(1); } - if let Err(e) = pageserver - .start(&pageserver_config_overrides(subcommand_args)) - .await - { + if let Err(e) = pageserver.start().await { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -1227,7 +1202,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } -async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { +async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> { // Endpoints are not started automatically broker::start_broker_process(env).await?; @@ -1244,10 +1219,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); - if let Err(e) = pageserver - .start(&pageserver_config_overrides(sub_match)) - .await - { + if let Err(e) = pageserver.start().await { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); try_stop_all(env, true).await; exit(1); @@ -1388,13 +1360,6 @@ fn cli() -> Command { .required(false) .value_name("stop-mode"); - let pageserver_config_args = Arg::new("pageserver-config-override") - .long("pageserver-config-override") - .num_args(1) - .action(ArgAction::Append) - .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more") - .required(false); - let remote_ext_config_args = Arg::new("remote-ext-config") .long("remote-ext-config") .num_args(1) @@ -1428,9 +1393,7 @@ fn cli() -> Command { let num_pageservers_arg = Arg::new("num-pageservers") .value_parser(value_parser!(u16)) .long("num-pageservers") - .help("How many pageservers to create (default 1)") - .required(false) - .default_value("1"); + .help("How many pageservers to create (default 1)"); let update_catalog = Arg::new("update-catalog") .value_parser(value_parser!(bool)) @@ -1444,20 +1407,25 @@ fn cli() -> Command { .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`") .required(false); + let allow_multiple = Arg::new("allow-multiple") + .help("Allow multiple primary endpoints running on the same branch. 
Shouldn't be used normally, but useful for tests.") + .long("allow-multiple") + .action(ArgAction::SetTrue) + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) .subcommand( Command::new("init") .about("Initialize a new Neon repository, preparing configs for services to start with") - .arg(pageserver_config_args.clone()) .arg(num_pageservers_arg.clone()) .arg( Arg::new("config") .long("config") .required(false) .value_parser(value_parser!(PathBuf)) - .value_name("config"), + .value_name("config") ) .arg(pg_version_arg.clone()) .arg(force_arg) @@ -1539,7 +1507,6 @@ fn cli() -> Command { .subcommand(Command::new("status")) .subcommand(Command::new("start") .about("Start local pageserver") - .arg(pageserver_config_args.clone()) ) .subcommand(Command::new("stop") .about("Stop local pageserver") @@ -1547,7 +1514,6 @@ fn cli() -> Command { ) .subcommand(Command::new("restart") .about("Restart local pageserver") - .arg(pageserver_config_args.clone()) ) ) .subcommand( @@ -1601,6 +1567,7 @@ fn cli() -> Command { .arg(pg_version_arg.clone()) .arg(hot_standby_arg.clone()) .arg(update_catalog) + .arg(allow_multiple.clone()) ) .subcommand(Command::new("start") .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") @@ -1609,6 +1576,7 @@ fn cli() -> Command { .arg(safekeepers_arg) .arg(remote_ext_config_args) .arg(create_test_user) + .arg(allow_multiple.clone()) ) .subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") @@ -1660,7 +1628,6 @@ fn cli() -> Command { .subcommand( Command::new("start") .about("Start page server and safekeepers") - .arg(pageserver_config_args) ) .subcommand( Command::new("stop") diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 6437d04ec896..d13884198eb8 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,7 +3,7 @@ //! Now it also provides init method which acts like a stub for proper installation //! script which will use local paths. -use anyhow::{bail, ensure, Context}; +use anyhow::{bail, Context}; use clap::ValueEnum; use postgres_backend::AuthType; @@ -23,6 +23,8 @@ use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, }; +use crate::pageserver::PageServerNode; +use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR; use crate::safekeeper::SafekeeperNode; pub const DEFAULT_PG_VERSION: u32 = 15; @@ -34,7 +36,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15; // to 'neon_local init --config=' option. See control_plane/simple.conf for // an example. // -#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +#[derive(PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and // compute endpoints). @@ -42,59 +44,99 @@ pub struct LocalEnv { // This is not stored in the config file. Rather, this is the path where the // config file itself is. It is read from the NEON_REPO_DIR env variable or // '.neon' if not given. - #[serde(skip)] pub base_data_dir: PathBuf, // Path to postgres distribution. It's expected that "bin", "include", // "lib", "share" from postgres distribution are there. If at some point // in time we will be able to run against vanilla postgres we may split that // to four separate paths and match OS-specific installation layout. - #[serde(default)] pub pg_distrib_dir: PathBuf, // Path to pageserver binary. 
- #[serde(default)] pub neon_distrib_dir: PathBuf, // Default tenant ID to use with the 'neon_local' command line utility, when // --tenant_id is not explicitly specified. - #[serde(default)] pub default_tenant_id: Option, // used to issue tokens during e.g pg start - #[serde(default)] pub private_key_path: PathBuf, pub broker: NeonBroker, // Configuration for the storage controller (1 per neon_local environment) - #[serde(default)] pub storage_controller: NeonStorageControllerConf, /// This Vec must always contain at least one pageserver + /// Populated by [`Self::load_config`] from the individual `pageserver.toml`s. + /// NB: not used anymore except for informing users that they need to change their `.neon/config`. pub pageservers: Vec, - #[serde(default)] pub safekeepers: Vec, // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. - #[serde(default)] pub control_plane_api: Option, // Control plane upcall API for storage controller. If set, this will be propagated into the // storage controller's configuration. - #[serde(default)] pub control_plane_compute_hook_api: Option, /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. - #[serde(default)] // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". + pub branch_name_mappings: HashMap>, +} + +/// On-disk state stored in `.neon/config`. +#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)] +#[serde(default, deny_unknown_fields)] +pub struct OnDiskConfig { + pub pg_distrib_dir: PathBuf, + pub neon_distrib_dir: PathBuf, + pub default_tenant_id: Option, + pub private_key_path: PathBuf, + pub broker: NeonBroker, + pub storage_controller: NeonStorageControllerConf, + #[serde( + skip_serializing, + deserialize_with = "fail_if_pageservers_field_specified" + )] + pub pageservers: Vec, + pub safekeepers: Vec, + pub control_plane_api: Option, + pub control_plane_compute_hook_api: Option, branch_name_mappings: HashMap>, } +fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + Err(serde::de::Error::custom( + "The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \ + Please remove the `pageservers` field from your .neon/config.", + )) +} + +/// The description of the neon_local env to be initialized by `neon_local init --config`. +#[derive(Clone, Debug, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct NeonLocalInitConf { + // TODO: do we need this? Seems unused + pub pg_distrib_dir: Option, + // TODO: do we need this? Seems unused + pub neon_distrib_dir: Option, + pub default_tenant_id: TenantId, + pub broker: NeonBroker, + pub storage_controller: Option, + pub pageservers: Vec, + pub safekeepers: Vec, + pub control_plane_api: Option>, + pub control_plane_compute_hook_api: Option>, +} + /// Broker config for cluster internal communication. #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] @@ -141,24 +183,18 @@ impl NeonBroker { } } +// neon_local needs to know this subset of pageserver configuration. +// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`. +// It can get stale if `pageserver.toml` is changed. 
+// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml` #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default, deny_unknown_fields)] pub struct PageServerConf { - // node id pub id: NodeId, - - // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, - - // auth type used for the PG and HTTP ports pub pg_auth_type: AuthType, pub http_auth_type: AuthType, - - pub(crate) virtual_file_io_engine: Option, - pub(crate) get_vectored_impl: Option, - pub(crate) get_impl: Option, - pub(crate) validate_vectored_get: Option, } impl Default for PageServerConf { @@ -169,10 +205,40 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, - virtual_file_io_engine: None, - get_vectored_impl: None, - get_impl: None, - validate_vectored_get: None, + } + } +} + +/// The toml that can be passed to `neon_local init --config`. +/// This is a subset of the `pageserver.toml` configuration. +// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) +#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] +pub struct NeonLocalInitPageserverConf { + pub id: NodeId, + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub pg_auth_type: AuthType, + pub http_auth_type: AuthType, + #[serde(flatten)] + pub other: HashMap, +} + +impl From<&NeonLocalInitPageserverConf> for PageServerConf { + fn from(conf: &NeonLocalInitPageserverConf) -> Self { + let NeonLocalInitPageserverConf { + id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + other: _, + } = conf; + Self { + id: *id, + listen_pg_addr: listen_pg_addr.clone(), + listen_http_addr: listen_http_addr.clone(), + pg_auth_type: *pg_auth_type, + http_auth_type: *http_auth_type, } } } @@ -360,44 +426,7 @@ impl LocalEnv { .collect() } - /// Create a LocalEnv from a config file. - /// - /// Unlike 'load_config', this function fills in any defaults that are missing - /// from the config file. - pub fn parse_config(toml: &str) -> anyhow::Result { - let mut env: LocalEnv = toml::from_str(toml)?; - - // Find postgres binaries. - // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". - // Note that later in the code we assume, that distrib dirs follow the same pattern - // for all postgres versions. - if env.pg_distrib_dir == Path::new("") { - if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { - env.pg_distrib_dir = postgres_bin.into(); - } else { - let cwd = env::current_dir()?; - env.pg_distrib_dir = cwd.join("pg_install") - } - } - - // Find neon binaries. - if env.neon_distrib_dir == Path::new("") { - env::current_exe()? - .parent() - .unwrap() - .clone_into(&mut env.neon_distrib_dir); - } - - if env.pageservers.is_empty() { - anyhow::bail!("Configuration must contain at least one pageserver"); - } - - env.base_data_dir = base_path(); - - Ok(env) - } - - /// Locate and load config + /// Construct `Self` from on-disk state. 
pub fn load_config() -> anyhow::Result { let repopath = base_path(); @@ -411,38 +440,129 @@ impl LocalEnv { // TODO: check that it looks like a neon repository // load and parse file - let config = fs::read_to_string(repopath.join("config"))?; - let mut env: LocalEnv = toml::from_str(config.as_str())?; + let config_file_contents = fs::read_to_string(repopath.join("config"))?; + let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?; + let mut env = { + let OnDiskConfig { + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + private_key_path, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + branch_name_mappings, + } = on_disk_config; + LocalEnv { + base_data_dir: repopath.clone(), + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + private_key_path, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + branch_name_mappings, + } + }; - env.base_data_dir = repopath; + // The source of truth for pageserver configuration is the pageserver.toml. + assert!( + env.pageservers.is_empty(), + "we ensure this during deserialization" + ); + env.pageservers = { + let iter = std::fs::read_dir(&repopath).context("open dir")?; + let mut pageservers = Vec::new(); + for res in iter { + let dentry = res?; + const PREFIX: &str = "pageserver_"; + let dentry_name = dentry + .file_name() + .into_string() + .ok() + .with_context(|| format!("non-utf8 dentry: {:?}", dentry.path())) + .unwrap(); + if !dentry_name.starts_with(PREFIX) { + continue; + } + if !dentry.file_type().context("determine file type")?.is_dir() { + anyhow::bail!("expected a directory, got {:?}", dentry.path()); + } + let id = dentry_name[PREFIX.len()..] + .parse::() + .with_context(|| format!("parse id from {:?}", dentry.path()))?; + // TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) + #[derive(serde::Serialize, serde::Deserialize)] + // (allow unknown fields, unlike PageServerConf) + struct PageserverConfigTomlSubset { + id: NodeId, + listen_pg_addr: String, + listen_http_addr: String, + pg_auth_type: AuthType, + http_auth_type: AuthType, + } + let config_toml_path = dentry.path().join("pageserver.toml"); + let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str( + &std::fs::read_to_string(&config_toml_path) + .with_context(|| format!("read {:?}", config_toml_path))?, + ) + .context("parse pageserver.toml")?; + let PageserverConfigTomlSubset { + id: config_toml_id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + } = config_toml; + let conf = PageServerConf { + id: { + anyhow::ensure!( + config_toml_id == id, + "id mismatch: config_toml.id={config_toml_id} id={id}", + ); + id + }, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + }; + pageservers.push(conf); + } + pageservers + }; Ok(env) } - pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { - // Currently, the user first passes a config file with 'neon_local init --config=' - // We read that in, in `create_config`, and fill any missing defaults. Then it's saved - // to .neon/config. TODO: We lose any formatting and comments along the way, which is - // a bit sad. - let mut conf_content = r#"# This file describes a local deployment of the page server -# and safekeeeper node. It is read by the 'neon_local' command-line -# utility. -"# - .to_string(); - - // Convert the LocalEnv to a toml file. 
- // - // This could be as simple as this: - // - // conf_content += &toml::to_string_pretty(env)?; - // - // But it results in a "values must be emitted before tables". I'm not sure - // why, AFAICS the table, i.e. 'safekeepers: Vec' is last. - // Maybe rust reorders the fields to squeeze avoid padding or something? - // In any case, converting to toml::Value first, and serializing that, works. - // See https://github.com/alexcrichton/toml-rs/issues/142 - conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?; - + pub fn persist_config(&self) -> anyhow::Result<()> { + Self::persist_config_impl( + &self.base_data_dir, + &OnDiskConfig { + pg_distrib_dir: self.pg_distrib_dir.clone(), + neon_distrib_dir: self.neon_distrib_dir.clone(), + default_tenant_id: self.default_tenant_id, + private_key_path: self.private_key_path.clone(), + broker: self.broker.clone(), + storage_controller: self.storage_controller.clone(), + pageservers: vec![], // it's skip_serializing anyway + safekeepers: self.safekeepers.clone(), + control_plane_api: self.control_plane_api.clone(), + control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), + branch_name_mappings: self.branch_name_mappings.clone(), + }, + ) + } + + pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> { + let conf_content = &toml::to_string_pretty(config)?; let target_config_path = base_path.join("config"); fs::write(&target_config_path, conf_content).with_context(|| { format!( @@ -467,17 +587,13 @@ impl LocalEnv { } } - // - // Initialize a new Neon repository - // - pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> { - // check if config already exists - let base_path = &self.base_data_dir; - ensure!( - base_path != Path::new(""), - "repository base path is missing" - ); + /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`]. + pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> { + let base_path = base_path(); + assert_ne!(base_path, Path::new("")); + let base_path = &base_path; + // create base_path dir if base_path.exists() { match force { InitForceMode::MustNotExist => { @@ -509,70 +625,96 @@ impl LocalEnv { } } } - - if !self.pg_bin_dir(pg_version)?.join("postgres").exists() { - bail!( - "Can't find postgres binary at {}", - self.pg_bin_dir(pg_version)?.display() - ); - } - for binary in ["pageserver", "safekeeper"] { - if !self.neon_distrib_dir.join(binary).exists() { - bail!( - "Can't find binary '{binary}' in neon distrib dir '{}'", - self.neon_distrib_dir.display() - ); - } - } - if !base_path.exists() { fs::create_dir(base_path)?; } + let NeonLocalInitConf { + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + } = conf; + + // Find postgres binaries. + // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". + // Note that later in the code we assume, that distrib dirs follow the same pattern + // for all postgres versions. + let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| { + if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { + postgres_bin.into() + } else { + let cwd = env::current_dir().unwrap(); + cwd.join("pg_install") + } + }); + + // Find neon binaries. + let neon_distrib_dir = neon_distrib_dir + .unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned()); + // Generate keypair for JWT. 
// // The keypair is only needed if authentication is enabled in any of the // components. For convenience, we generate the keypair even if authentication // is not enabled, so that you can easily enable it after the initialization - // step. However, if the key generation fails, we treat it as non-fatal if - // authentication was not enabled. - if self.private_key_path == PathBuf::new() { - match generate_auth_keys( - base_path.join("auth_private_key.pem").as_path(), - base_path.join("auth_public_key.pem").as_path(), - ) { - Ok(()) => { - self.private_key_path = PathBuf::from("auth_private_key.pem"); - } - Err(e) => { - if !self.auth_keys_needed() { - eprintln!("Could not generate keypair for JWT authentication: {e}"); - eprintln!("Continuing anyway because authentication was not enabled"); - self.private_key_path = PathBuf::from("auth_private_key.pem"); - } else { - return Err(e); - } - } - } + // step. + generate_auth_keys( + base_path.join("auth_private_key.pem").as_path(), + base_path.join("auth_public_key.pem").as_path(), + ) + .context("generate auth keys")?; + let private_key_path = PathBuf::from("auth_private_key.pem"); + + // create the runtime type because the remaining initialization code below needs + // a LocalEnv instance to operate on + // TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state + let env = LocalEnv { + base_data_dir: base_path.clone(), + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id: Some(default_tenant_id), + private_key_path, + broker, + storage_controller: storage_controller.unwrap_or_default(), + pageservers: pageservers.iter().map(Into::into).collect(), + safekeepers, + control_plane_api: control_plane_api.unwrap_or_default(), + control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), + branch_name_mappings: Default::default(), + }; + + // create endpoints dir + fs::create_dir_all(env.endpoints_path())?; + + // create safekeeper dirs + for safekeeper in &env.safekeepers { + fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?; } - fs::create_dir_all(self.endpoints_path())?; - - for safekeeper in &self.safekeepers { - fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?; + // initialize pageserver state + for (i, ps) in pageservers.into_iter().enumerate() { + let runtime_ps = &env.pageservers[i]; + assert_eq!(&PageServerConf::from(&ps), runtime_ps); + fs::create_dir(env.pageserver_data_dir(ps.id))?; + PageServerNode::from_env(&env, runtime_ps) + .initialize(ps) + .context("pageserver init failed")?; } - self.persist_config(base_path) - } + // set up remote storage location for default LocalFs remote storage + std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; - fn auth_keys_needed(&self) -> bool { - self.pageservers.iter().any(|ps| { - ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT - }) || self.safekeepers.iter().any(|sk| sk.auth_enabled) + env.persist_config() } } -fn base_path() -> PathBuf { +pub fn base_path() -> PathBuf { match std::env::var_os("NEON_REPO_DIR") { Some(val) => PathBuf::from(val), None => PathBuf::from(".neon"), @@ -615,31 +757,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow } Ok(()) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn simple_conf_parsing() { - let simple_conf_toml = include_str!("../simple.conf"); - let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml); - assert!( 
simple_conf_parse_result.is_ok(), - "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}" - ); - - let string_to_replace = "listen_addr = '127.0.0.1:50051'"; - let spoiled_url_str = "listen_addr = '!@$XOXO%^&'"; - let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str); - assert!( - spoiled_url_toml.contains(spoiled_url_str), - "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}" - ); - let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml); - assert!( - spoiled_url_parse_result.is_err(), - "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}" - ); - } -} diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 1a643913062a..5a8476369769 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -4,21 +4,21 @@ //! //! .neon/ //! -use std::borrow::Cow; use std::collections::HashMap; use std::io; use std::io::Write; use std::num::NonZeroU64; use std::path::PathBuf; -use std::process::Command; +use std::str::FromStr; use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; use futures::SinkExt; use pageserver_api::models::{ - self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo, + self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, + TimelineInfo, }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; @@ -30,7 +30,7 @@ use utils::{ lsn::Lsn, }; -use crate::local_env::PageServerConf; +use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf}; use crate::{background_process, local_env::LocalEnv}; /// Directory within .neon which will be used by default for LocalFs remote storage. @@ -74,71 +74,23 @@ impl PageServerNode { } } - /// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration. - /// - /// These all end up on the command line of the `pageserver` binary. - fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec { + fn pageserver_init_make_toml( + &self, + conf: NeonLocalInitPageserverConf, + ) -> anyhow::Result { + assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"); + + // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) + // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. 
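// ---- Editor's sketch, not part of the patch ----
// One way to address the FIXME above: rather than splicing the raw path into a TOML
// fragment with format!(), let a serializer quote it, so spaces and quotes in the path
// cannot break the generated pageserver config. Assumes the serde derive macros and the
// `toml_edit` crate this patch already uses; the struct and helper names are illustrative.
#[derive(serde::Serialize)]
struct PgDistribDirFragment<'a> {
    pg_distrib_dir: &'a str,
}

fn pg_distrib_dir_fragment(path: &std::path::Path) -> anyhow::Result<String> {
    let fragment = PgDistribDirFragment {
        pg_distrib_dir: path
            .to_str()
            .ok_or_else(|| anyhow::anyhow!("non-UTF-8 pg_distrib_dir"))?,
    };
    // Yields e.g.: pg_distrib_dir = "/srv/pg install/with 'quotes'"
    Ok(toml_edit::ser::to_string_pretty(&fragment)?)
}
// ---- end editor's sketch ----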
let pg_distrib_dir_param = format!( "pg_distrib_dir='{}'", self.env.pg_distrib_dir_raw().display() ); - let PageServerConf { - id, - listen_pg_addr, - listen_http_addr, - pg_auth_type, - http_auth_type, - virtual_file_io_engine, - get_vectored_impl, - get_impl, - validate_vectored_get, - } = &self.conf; - - let id = format!("id={}", id); - - let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type); - let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr); - - let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type); - let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr); - let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine { - format!("virtual_file_io_engine='{virtual_file_io_engine}'") - } else { - String::new() - }; - let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl { - format!("get_vectored_impl='{get_vectored_impl}'") - } else { - String::new() - }; - let get_impl = if let Some(get_impl) = get_impl { - format!("get_impl='{get_impl}'") - } else { - String::new() - }; - let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get { - format!("validate_vectored_get={validate_vectored_get}") - } else { - String::new() - }; - let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); - let mut overrides = vec![ - id, - pg_distrib_dir_param, - http_auth_type_param, - pg_auth_type_param, - listen_http_addr_param, - listen_pg_addr_param, - broker_endpoint_param, - virtual_file_io_engine, - get_vectored_impl, - get_impl, - validate_vectored_get, - ]; + let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param]; if let Some(control_plane_api) = &self.env.control_plane_api { overrides.push(format!( @@ -148,7 +100,7 @@ impl PageServerNode { // Storage controller uses the same auth as pageserver: if JWT is enabled // for us, we will also need it to talk to them. - if matches!(http_auth_type, AuthType::NeonJWT) { + if matches!(conf.http_auth_type, AuthType::NeonJWT) { let jwt_token = self .env .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) @@ -157,31 +109,40 @@ impl PageServerNode { } } - if !cli_overrides - .iter() - .any(|c| c.starts_with("remote_storage")) - { + if !conf.other.contains_key("remote_storage") { overrides.push(format!( "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}" )); } - if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust { + if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust { // Keys are generated in the toplevel repo dir, pageservers' workdirs // are one level below that, so refer to keys with ../ overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned()); } // Apply the user-provided overrides - overrides.extend(cli_overrides.iter().map(|&c| c.to_owned())); + overrides.push( + toml_edit::ser::to_string_pretty(&conf) + .expect("we deserialized this from toml earlier"), + ); - overrides + // Turn `overrides` into a toml document. + // TODO: above code is legacy code, it should be refactored to use toml_edit directly. 
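// ---- Editor's sketch, not part of the patch ----
// What the fragment-merging loop below amounts to, as a standalone helper: parse each
// override fragment as its own TOML document and copy its top-level entries into the
// merged document. Later fragments win on key collisions because `insert` overwrites.
// Uses only the toml_edit calls that already appear elsewhere in this patch.
fn merge_toml_fragments(fragments: &[String]) -> anyhow::Result<toml_edit::Document> {
    use std::str::FromStr;
    let mut merged = toml_edit::Document::new();
    for fragment_str in fragments {
        let fragment = toml_edit::Document::from_str(fragment_str)?;
        for (key, item) in fragment.iter() {
            merged.insert(key, item.clone());
        }
    }
    Ok(merged)
}
// e.g. merging ["a=1", "b={c=2}", "a=3"] produces a document equivalent to: a = 3, b = { c = 2 }
// ---- end editor's sketch ----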
+ let mut config_toml = toml_edit::Document::new(); + for fragment_str in overrides { + let fragment = toml_edit::Document::from_str(&fragment_str) + .expect("all fragments in `overrides` are valid toml documents, this function controls that"); + for (key, item) in fragment.iter() { + config_toml.insert(key, item.clone()); + } + } + Ok(config_toml) } /// Initializes a pageserver node by creating its config with the overrides provided. - pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> { - // First, run `pageserver --init` and wait for it to write a config into FS and exit. - self.pageserver_init(config_overrides) + pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { + self.pageserver_init(conf) .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id)) } @@ -197,11 +158,11 @@ impl PageServerNode { .expect("non-Unicode path") } - pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> { - self.start_node(config_overrides, false).await + pub async fn start(&self) -> anyhow::Result<()> { + self.start_node().await } - fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { + fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { let datadir = self.repo_path(); let node_id = self.conf.id; println!( @@ -212,29 +173,20 @@ impl PageServerNode { ); io::stdout().flush()?; - if !datadir.exists() { - std::fs::create_dir(&datadir)?; - } - - let datadir_path_str = datadir.to_str().with_context(|| { - format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}") - })?; - let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); - args.push(Cow::Borrowed("--init")); - - let init_output = Command::new(self.env.pageserver_bin()) - .args(args.iter().map(Cow::as_ref)) - .envs(self.pageserver_env_variables()?) 
- .output() - .with_context(|| format!("Failed to run pageserver init for node {node_id}"))?; - - anyhow::ensure!( - init_output.status.success(), - "Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}", - node_id, - String::from_utf8_lossy(&init_output.stdout), - String::from_utf8_lossy(&init_output.stderr), - ); + let config = self + .pageserver_init_make_toml(conf) + .context("make pageserver toml")?; + let config_file_path = datadir.join("pageserver.toml"); + let mut config_file = std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(&config_file_path) + .with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?; + config_file + .write_all(config.to_string().as_bytes()) + .context("write pageserver toml")?; + drop(config_file); + // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config // Write metadata file, used by pageserver on startup to register itself with // the storage controller @@ -262,11 +214,7 @@ impl PageServerNode { Ok(()) } - async fn start_node( - &self, - config_overrides: &[&str], - update_config: bool, - ) -> anyhow::Result<()> { + async fn start_node(&self) -> anyhow::Result<()> { // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); print!( @@ -283,15 +231,12 @@ impl PageServerNode { self.conf.id, datadir, ) })?; - let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); - if update_config { - args.push(Cow::Borrowed("--update-config")); - } + let args = vec!["-D", datadir_path_str]; background_process::start_process( "pageserver", &datadir, &self.env.pageserver_bin(), - args.iter().map(Cow::as_ref), + args, self.pageserver_env_variables()?, background_process::InitialPidFile::Expect(self.pid_file()), || async { @@ -308,22 +253,6 @@ impl PageServerNode { Ok(()) } - fn pageserver_basic_args<'a>( - &self, - config_overrides: &'a [&'a str], - datadir_path_str: &'a str, - ) -> Vec> { - let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)]; - - let overrides = self.neon_local_overrides(config_overrides); - for config_override in overrides { - args.push(Cow::Borrowed("-c")); - args.push(Cow::Owned(config_override)); - } - - args - } - fn pageserver_env_variables(&self) -> anyhow::Result> { // FIXME: why is this tied to pageserver's auth type? 
Whether or not the safekeeper // needs a token, and how to generate that token, seems independent to whether @@ -449,11 +378,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - switch_to_aux_file_v2: settings - .remove("switch_to_aux_file_v2") - .map(|x| x.parse::()) + switch_aux_file_policy: settings + .remove("switch_aux_file_policy") + .map(|x| x.parse::()) .transpose() - .context("Failed to parse 'switch_to_aux_file_v2' as bool")?, + .context("Failed to parse 'switch_aux_file_policy'")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -572,11 +501,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - switch_to_aux_file_v2: settings - .remove("switch_to_aux_file_v2") - .map(|x| x.parse::()) + switch_aux_file_policy: settings + .remove("switch_aux_file_policy") + .map(|x| x.parse::()) .transpose() - .context("Failed to parse 'switch_to_aux_file_v2' as bool")?, + .context("Failed to parse 'switch_aux_file_policy'")?, } }; diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 8e0dbe6ce4f5..141d8a6d0198 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -480,6 +480,15 @@ impl CounterPairVec { let id = self.vec.with_labels(labels); self.vec.remove_metric(id) } + + pub fn sample(&self, labels: ::Group<'_>) -> u64 { + let id = self.vec.with_labels(labels); + let metric = self.vec.get_metric(id); + + let inc = metric.inc.count.load(std::sync::atomic::Ordering::Relaxed); + let dec = metric.dec.count.load(std::sync::atomic::Ordering::Relaxed); + inc.saturating_sub(dec) + } } impl ::measured::metric::group::MetricGroup for CounterPairVec diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index a9ad3aca18fb..c0c4710a00a9 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -240,7 +240,7 @@ impl<'a> ShardedRange<'a> { /// pages that would not actually be stored on this node. /// /// Don't use this function in code that works with physical entities like layer files. 
- fn raw_size(range: &Range) -> u32 { + pub fn raw_size(range: &Range) -> u32 { if is_contiguous_range(range) { contiguous_range_len(range) } else { diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index a54cdb520d93..1df5820fb946 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1,3 +1,4 @@ +pub mod detach_ancestor; pub mod partitioning; pub mod utilization; @@ -8,6 +9,7 @@ use std::{ collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, + str::FromStr, time::{Duration, SystemTime}, }; @@ -303,7 +305,31 @@ pub struct TenantConfig { pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, - pub switch_to_aux_file_v2: Option, + pub switch_aux_file_policy: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum AuxFilePolicy { + V1, + V2, + CrossValidation, +} + +impl FromStr for AuxFilePolicy { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let s = s.to_lowercase(); + if s == "v1" { + Ok(Self::V1) + } else if s == "v2" { + Ok(Self::V2) + } else if s == "crossvalidation" || s == "cross_validation" { + Ok(Self::CrossValidation) + } else { + anyhow::bail!("cannot parse {} to aux file policy", s) + } + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs new file mode 100644 index 000000000000..fc1f10e7345f --- /dev/null +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -0,0 +1,6 @@ +use utils::id::TimelineId; + +#[derive(Default, serde::Serialize)] +pub struct AncestorDetached { + pub reparented_timelines: Vec, +} diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index c0b89cee2aaf..c3d6c75e20da 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -27,7 +27,7 @@ use aws_config::{ }; use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::{ - config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, + config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, @@ -75,13 +75,13 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. 
- pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result { + pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", - aws_config.bucket_name + remote_storage_config.bucket_name ); - let region = Some(Region::new(aws_config.bucket_region.clone())); + let region = Some(Region::new(remote_storage_config.bucket_region.clone())); let provider_conf = ProviderConfig::without_region().with_region(region.clone()); @@ -113,6 +113,38 @@ impl S3Bucket { // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off let sleep_impl: Arc = Arc::new(TokioSleep::new()); + let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults( + #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ + BehaviorVersion::v2023_11_09(), + ) + .region(region) + .identity_cache(IdentityCache::lazy().build()) + .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)); + + let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| { + s.spawn(|| { + // TODO: make this function async. + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap() + .block_on(sdk_config_loader.load()) + }) + .join() + .unwrap() + }); + + let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config); + + // Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions. + // (In case we ever re-use the `sdk_config` for more than just the S3 client in the future) + if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() { + s3_config_builder = s3_config_builder + .endpoint_url(custom_endpoint) + .force_path_style(true); + } + // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled. @@ -120,42 +152,36 @@ impl S3Bucket { retry_config .set_max_attempts(Some(1)) .set_mode(Some(RetryMode::Adaptive)); + s3_config_builder = s3_config_builder.retry_config(retry_config.build()); + + let s3_config = s3_config_builder.build(); + let client = aws_sdk_s3::Client::from_conf(s3_config); + + let prefix_in_bucket = remote_storage_config + .prefix_in_bucket + .as_deref() + .map(|prefix| { + let mut prefix = prefix; + while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix = &prefix[1..] + } - let mut config_builder = Builder::default() - .behavior_version(BehaviorVersion::v2023_11_09()) - .region(region) - .identity_cache(IdentityCache::lazy().build()) - .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) - .retry_config(retry_config.build()) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)); - - if let Some(custom_endpoint) = aws_config.endpoint.clone() { - config_builder = config_builder - .endpoint_url(custom_endpoint) - .force_path_style(true); - } - - let client = Client::from_conf(config_builder.build()); - - let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { - let mut prefix = prefix; - while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix = &prefix[1..] 
- } + let mut prefix = prefix.to_string(); + while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix.pop(); + } + prefix + }); - let mut prefix = prefix.to_string(); - while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix.pop(); - } - prefix - }); Ok(Self { client, - bucket_name: aws_config.bucket_name.clone(), - max_keys_per_list_response: aws_config.max_keys_per_list_response, + bucket_name: remote_storage_config.bucket_name.clone(), + max_keys_per_list_response: remote_storage_config.max_keys_per_list_response, prefix_in_bucket, - concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), - upload_storage_class: aws_config.upload_storage_class.clone(), + concurrency_limiter: ConcurrencyLimiter::new( + remote_storage_config.concurrency_limit.get(), + ), + upload_storage_class: remote_storage_config.upload_storage_class.clone(), timeout, }) } diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs index 0bf5664f475f..27378c69fc1c 100644 --- a/libs/utils/src/poison.rs +++ b/libs/utils/src/poison.rs @@ -3,7 +3,7 @@ //! # Example //! //! ``` -//! # tokio_test::block_on(async { +//! # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { //! use utils::poison::Poison; //! use std::time::Duration; //! diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 906302e46e46..bbc366340260 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -50,6 +50,14 @@ extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr { } } +extern "C" fn update_donor(wp: *mut WalProposer, donor: *mut Safekeeper, donor_lsn: XLogRecPtr) { + unsafe { + let callback_data = (*(*wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).update_donor(&mut (*donor), donor_lsn) + } +} + extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz { unsafe { let callback_data = (*(*wp).config).callback_data; @@ -391,6 +399,7 @@ pub(crate) fn create_api() -> walproposer_api { get_shmem_state: Some(get_shmem_state), start_streaming: Some(start_streaming), get_flush_rec_ptr: Some(get_flush_rec_ptr), + update_donor: Some(update_donor), get_current_timestamp: Some(get_current_timestamp), conn_error_message: Some(conn_error_message), conn_status: Some(conn_status), @@ -421,6 +430,32 @@ pub(crate) fn create_api() -> walproposer_api { } } +pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { + let empty_feedback = crate::bindings::PageserverFeedback { + present: false, + currentClusterSize: 0, + last_received_lsn: 0, + disk_consistent_lsn: 0, + remote_consistent_lsn: 0, + replytime: 0, + shard_number: 0, + }; + + crate::bindings::WalproposerShmemState { + propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 }, + donor_name: [0; 64], + donor_conninfo: [0; 1024], + donor_lsn: 0, + mutex: 0, + mineLastElectedTerm: crate::bindings::pg_atomic_uint64 { value: 0 }, + backpressureThrottlingTime: crate::bindings::pg_atomic_uint64 { value: 0 }, + currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 }, + shard_ps_feedback: [empty_feedback; 128], + num_shards: 0, + min_ps_feedback: empty_feedback, + } +} + impl std::fmt::Display for Level { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "{:?}", self) diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 14cc3e05a27d..fb815607a747 100644 --- a/libs/walproposer/src/walproposer.rs +++ 
b/libs/walproposer/src/walproposer.rs @@ -1,8 +1,5 @@ use std::ffi::CString; -use postgres_ffi::WAL_SEGMENT_SIZE; -use utils::{id::TenantTimelineId, lsn::Lsn}; - use crate::{ api_bindings::{create_api, take_vec_u8, Level}, bindings::{ @@ -10,6 +7,8 @@ use crate::{ WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart, }, }; +use postgres_ffi::WAL_SEGMENT_SIZE; +use utils::{id::TenantTimelineId, lsn::Lsn}; /// Rust high-level wrapper for C walproposer API. Many methods are not required /// for simple cases, hence todo!() in default implementations. @@ -28,6 +27,10 @@ pub trait ApiImpl { todo!() } + fn update_donor(&self, _donor: &mut Safekeeper, _donor_lsn: u64) { + todo!() + } + fn get_current_timestamp(&self) -> i64 { todo!() } @@ -274,6 +277,7 @@ mod tests { sync::{atomic::AtomicUsize, mpsc::sync_channel}, }; + use std::cell::UnsafeCell; use utils::id::TenantTimelineId; use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper}; @@ -297,6 +301,8 @@ mod tests { replies_ptr: AtomicUsize, // channel to send LSN to the main thread sync_channel: std::sync::mpsc::SyncSender, + // Shmem state, used for storing donor info + shmem: UnsafeCell, } impl MockImpl { @@ -327,11 +333,22 @@ mod tests { } impl ApiImpl for MockImpl { + fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState { + self.shmem.get() + } + fn get_current_timestamp(&self) -> i64 { println!("get_current_timestamp"); 0 } + fn update_donor(&self, donor: &mut crate::bindings::Safekeeper, donor_lsn: u64) { + let mut shmem = unsafe { *self.get_shmem_state() }; + shmem.propEpochStartLsn.value = donor_lsn; + shmem.donor_conninfo = donor.conninfo; + shmem.donor_lsn = donor_lsn; + } + fn conn_status( &self, _: &mut crate::bindings::Safekeeper, @@ -507,6 +524,7 @@ mod tests { ], replies_ptr: AtomicUsize::new(0), sync_channel: sender, + shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()), }); let config = crate::walproposer::Config { ttid, diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5d05af0c0023..1d02aa7709bb 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,7 +1,7 @@ use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::storage_layer::PersistentLayerDesc; use pageserver_api::shard::TenantShardId; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; @@ -28,7 +28,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let mut updates = layer_map.batch_update(); for fname in filenames { let fname = fname.unwrap(); - let fname = LayerFileName::from_str(&fname).unwrap(); + let fname = LayerName::from_str(&fname).unwrap(); let layer = PersistentLayerDesc::from(fname); let lsn_range = layer.get_lsn_range(); diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 137b93055a6d..20e9cf21966f 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -24,7 +24,9 @@ use tracing::{debug, info}; use std::collections::{HashSet, VecDeque}; use std::ops::Range; -use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with}; +use crate::helpers::{ + accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, +}; use 
crate::interface::*; use utils::lsn::Lsn; @@ -104,7 +106,13 @@ pub async fn compact_tiered( ctx, ) .await?; - if target_file_size == u64::MAX { + if current_level_target_height == u64::MAX { + // our target height includes all possible lsns + info!( + level = current_level_no, + depth = depth, + "compaction loop reached max current_level_target_height" + ); break; } current_level_no += 1; @@ -535,7 +543,10 @@ where } } // Open stream - let key_value_stream = std::pin::pin!(merge_delta_keys::(deltas.as_slice(), ctx)); + let key_value_stream = + std::pin::pin!(merge_delta_keys_buffered::(deltas.as_slice(), ctx) + .await? + .map(Result::<_, anyhow::Error>::Ok)); let mut new_jobs = Vec::new(); // Slide a window through the keyspace diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 1b80373ba70b..06454ee1d050 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -9,10 +9,12 @@ use pageserver_api::shard::ShardIdentity; use pin_project_lite::pin_project; use std::collections::BinaryHeap; use std::collections::VecDeque; +use std::fmt::Display; use std::future::Future; use std::ops::{DerefMut, Range}; use std::pin::Pin; use std::task::{ready, Poll}; +use utils::lsn::Lsn; pub fn keyspace_total_size( keyspace: &CompactionKeySpace, @@ -108,17 +110,40 @@ pub fn merge_delta_keys<'a, E: CompactionJobExecutor>( } } +pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>( + layers: &'a [E::DeltaLayer], + ctx: &'a E::RequestContext, +) -> anyhow::Result>::DeltaEntry<'a>>> +{ + let mut keys = Vec::new(); + for l in layers { + // Boxing and casting to LoadFuture is required to obtain the right Sync bound. + // If we do l.load_keys(ctx).await? directly, there is a compilation error. 
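// ---- Editor's note, not part of the patch ----
// Expanding on the comment above: a boxed trait object carries exactly the auto-trait
// bounds written into its type, so the enclosing async fn no longer depends on the
// unnameable concrete future returned by `load_keys`. Illustrative sketch only -- the
// crate's real `LoadFuture` alias (and its Send/Sync bounds) may be defined differently:
//
//     use std::{future::Future, pin::Pin};
//     type BoxedLoadFuture<'a, T> =
//         Pin<Box<dyn Future<Output = anyhow::Result<T>> + Send + 'a>>;
//     fn boxed<'a, T, F>(fut: F) -> BoxedLoadFuture<'a, T>
//     where
//         F: Future<Output = anyhow::Result<T>> + Send + 'a,
//     {
//         Box::pin(fut) // unsizing coercion to the trait object happens here
//     }
// ---- end editor's note ----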
+ let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx)); + keys.extend(load_future.await?.into_iter()); + } + keys.sort_by_key(|k| (k.key(), k.lsn())); + let stream = futures::stream::iter(keys.into_iter()); + Ok(stream) +} + enum LazyLoadLayer<'a, E: CompactionJobExecutor> { Loaded(VecDeque<>::DeltaEntry<'a>>), Unloaded(&'a E::DeltaLayer), } impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { - fn key(&self) -> E::Key { + fn min_key(&self) -> E::Key { match self { Self::Loaded(entries) => entries.front().unwrap().key(), Self::Unloaded(dl) => dl.key_range().start, } } + fn min_lsn(&self) -> Lsn { + match self { + Self::Loaded(entries) => entries.front().unwrap().lsn(), + Self::Unloaded(dl) => dl.lsn_range().start, + } + } } impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { fn partial_cmp(&self, other: &Self) -> Option { @@ -128,12 +153,12 @@ impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { // reverse order so that we get a min-heap - other.key().cmp(&self.key()) + (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn())) } } impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> { fn eq(&self, other: &Self) -> bool { - self.key().eq(&other.key()) + self.cmp(other) == std::cmp::Ordering::Equal } } impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {} @@ -214,7 +239,7 @@ pub struct KeySize { pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream, E>> where - K: Eq, + K: Eq + PartialOrd + Display + Copy, I: Stream>, D: CompactionDeltaEntry<'a, K>, { @@ -229,12 +254,15 @@ where num_values: 1, size: first.size(), }; + let mut last_key = accum.key; while let Some(this) = input.next().await { let this = this?; if this.key() == accum.key { accum.size += this.size(); accum.num_values += 1; } else { + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); + last_key = accum.key; yield accum; accum = KeySize { key: this.key(), @@ -243,6 +271,7 @@ where }; } } + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); yield accum; } } diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs index 1cea2a20e1dd..7aa20e6863b4 100644 --- a/pageserver/compaction/tests/tests.rs +++ b/pageserver/compaction/tests/tests.rs @@ -1,5 +1,20 @@ +use once_cell::sync::OnceCell; use pageserver_compaction::interface::CompactionLayer; use pageserver_compaction::simulator::MockTimeline; +use utils::logging; + +static LOG_HANDLE: OnceCell<()> = OnceCell::new(); + +pub(crate) fn setup_logging() { + LOG_HANDLE.get_or_init(|| { + logging::init( + logging::LogFormat::Test, + logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + ) + .expect("Failed to init test logging") + }); +} /// Test the extreme case that there are so many updates for a single key that /// even if we produce an extremely narrow delta layer, spanning just that one @@ -11,13 +26,14 @@ use pageserver_compaction::simulator::MockTimeline; #[ignore] #[tokio::test] async fn test_many_updates_for_single_key() { + setup_logging(); let mut executor = MockTimeline::new(); - executor.target_file_size = 10_000_000; // 10 MB + executor.target_file_size = 1_000_000; // 1 MB - // Ingest 100 MB of updates to a single key. + // Ingest 10 MB of updates to a single key. 
for _ in 1..1000 { executor.ingest_uniform(100, 10, &(0..100_000)).unwrap(); - executor.ingest_uniform(10_000, 10, &(0..1)).unwrap(); + executor.ingest_uniform(1000, 10, &(0..1)).unwrap(); executor.compact().await.unwrap(); } @@ -33,3 +49,26 @@ async fn test_many_updates_for_single_key() { } } } + +#[tokio::test] +async fn test_simple_updates() { + setup_logging(); + let mut executor = MockTimeline::new(); + executor.target_file_size = 500_000; // 500 KB + + // Ingest some traffic. + for _ in 1..400 { + executor.ingest_uniform(100, 500, &(0..100_000)).unwrap(); + } + + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } + + println!("Running compaction..."); + executor.compact().await.unwrap(); + + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } +} diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 9a556cb3d4f4..d8082f8ab4cf 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -28,6 +28,8 @@ //! # From an `index_part.json` in S3 //! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg //! +//! # enrich with lines for gc_cutoff and a child branch point +//! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg //! ``` //! //! ## Viewing @@ -48,7 +50,7 @@ //! ``` //! -use anyhow::Result; +use anyhow::{Context, Result}; use pageserver::repository::Key; use pageserver::METADATA_FILE_NAME; use std::cmp::Ordering; @@ -90,6 +92,33 @@ fn parse_filename(name: &str) -> (Range, Range) { (keys, lsns) } +#[derive(Clone, Copy)] +enum LineKind { + GcCutoff, + Branch, +} + +impl From for Fill { + fn from(value: LineKind) -> Self { + match value { + LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)), + LineKind::Branch => Fill::Color(rgb(0, 255, 0)), + } + } +} + +impl FromStr for LineKind { + type Err = anyhow::Error; + + fn from_str(s: &str) -> std::prelude::v1::Result { + Ok(match s { + "gc_cutoff" => LineKind::GcCutoff, + "branch" => LineKind::Branch, + _ => anyhow::bail!("unsupported linekind: {s}"), + }) + } +} + pub fn main() -> Result<()> { // Parse layer filenames from stdin struct Layer { @@ -99,8 +128,29 @@ pub fn main() -> Result<()> { } let mut files: Vec = vec![]; let stdin = io::stdin(); - for line in stdin.lock().lines() { + + let mut lines: Vec<(Lsn, LineKind)> = vec![]; + + for (lineno, line) in stdin.lock().lines().enumerate() { + let lineno = lineno + 1; + let line = line.unwrap(); + if let Some((kind, lsn)) = line.split_once(':') { + let (kind, lsn) = LineKind::from_str(kind) + .context("parse kind") + .and_then(|kind| { + if lsn.contains('/') { + Lsn::from_str(lsn) + } else { + Lsn::from_hex(lsn) + } + .map(|lsn| (kind, lsn)) + .context("parse lsn") + }) + .with_context(|| format!("parse {line:?} on {lineno}"))?; + lines.push((lsn, kind)); + continue; + } let line = PathBuf::from_str(&line).unwrap(); let filename = line.file_name().unwrap(); let filename = filename.to_str().unwrap(); @@ -117,8 +167,9 @@ pub fn main() -> Result<()> { } // Collect all coordinates - let mut keys: Vec = vec![]; - let mut lsns: Vec = vec![]; + let mut keys: Vec = Vec::with_capacity(files.len()); + let mut lsns: Vec = Vec::with_capacity(files.len() + lines.len()); + for Layer { key_range: keyr, lsn_range: lsnr, @@ -131,6 +182,8 @@ 
pub fn main() -> Result<()> { lsns.push(lsnr.end); } + lsns.extend(lines.iter().map(|(lsn, _)| *lsn)); + // Analyze let key_map = build_coordinate_compression_map(keys); let lsn_map = build_coordinate_compression_map(lsns); @@ -144,10 +197,13 @@ pub fn main() -> Result<()> { println!( "{}", BeginSvg { - w: key_map.len() as f32, + w: (key_map.len() + 10) as f32, h: stretch * lsn_map.len() as f32 } ); + + let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas + for Layer { filename, key_range: keyr, @@ -169,7 +225,6 @@ pub fn main() -> Result<()> { let mut lsn_diff = (lsn_end - lsn_start) as f32; let mut fill = Fill::None; let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas - let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas let mut lsn_offset = 0.0; // Fill in and thicken rectangle if it's an @@ -189,7 +244,7 @@ pub fn main() -> Result<()> { println!( " {}", rectangle( - key_start as f32 + stretch * xmargin, + 5.0 + key_start as f32 + stretch * xmargin, stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)), key_diff as f32 - stretch * 2.0 * xmargin, stretch * (lsn_diff - 2.0 * ymargin) @@ -200,6 +255,26 @@ pub fn main() -> Result<()> { .comment(filename) ); } + + for (lsn, kind) in lines { + let lsn_start = *lsn_map.get(&lsn).unwrap(); + let lsn_end = lsn_start; + let stretch = 2.0; + let lsn_diff = 0.3; + let lsn_offset = -lsn_diff / 2.0; + let ymargin = 0.05; + println!( + "{}", + rectangle( + 0.0f32 + stretch * xmargin, + stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)), + (key_map.len() + 10) as f32, + stretch * (lsn_diff - 2.0 * ymargin) + ) + .fill(kind) + ); + } + println!("{}", EndSvg); eprintln!("num_images: {}", num_images); diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 20e557291489..0d010eb009be 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use anyhow::Context; use camino::Utf8PathBuf; use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::{metadata::TimelineMetadata, IndexPart}; use utils::lsn::Lsn; @@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; #[derive(serde::Serialize)] struct Output<'a> { - layer_metadata: &'a HashMap, + layer_metadata: &'a HashMap, disk_consistent_lsn: Lsn, timeline_metadata: &'a TimelineMetadata, } diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index a343acaf7a30..a26ed84a0d7d 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -1,3 +1,4 @@ +use bytes::{Buf, BufMut, Bytes}; use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; use tracing::warn; @@ -61,6 +62,84 @@ pub fn encode_aux_file_key(path: &str) -> Key { } } +const AUX_FILE_ENCODING_VERSION: u8 = 0x01; + +pub fn decode_file_value(val: &[u8]) -> anyhow::Result> { + let mut ptr = val; + if ptr.is_empty() { + // empty value = no files + return Ok(Vec::new()); + } + assert_eq!( + ptr.get_u8(), + AUX_FILE_ENCODING_VERSION, + "unsupported aux file value" + ); + let mut files = vec![]; + while ptr.has_remaining() { + let key_len = ptr.get_u32() as usize; + let key = &ptr[..key_len]; + ptr.advance(key_len); + let 
val_len = ptr.get_u32() as usize; + let content = &ptr[..val_len]; + ptr.advance(val_len); + + let path = std::str::from_utf8(key)?; + files.push((path, content)); + } + Ok(files) +} + +/// Decode an aux file key-value pair into a list of files. The returned `Bytes` contains reference +/// to the original value slice. Be cautious about memory consumption. +pub fn decode_file_value_bytes(val: &Bytes) -> anyhow::Result> { + let mut ptr = val.clone(); + if ptr.is_empty() { + // empty value = no files + return Ok(Vec::new()); + } + assert_eq!( + ptr.get_u8(), + AUX_FILE_ENCODING_VERSION, + "unsupported aux file value" + ); + let mut files = vec![]; + while ptr.has_remaining() { + let key_len = ptr.get_u32() as usize; + let key = ptr.slice(..key_len); + ptr.advance(key_len); + let val_len = ptr.get_u32() as usize; + let content = ptr.slice(..val_len); + ptr.advance(val_len); + + let path = std::str::from_utf8(&key)?.to_string(); + files.push((path, content)); + } + Ok(files) +} + +pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result> { + if files.is_empty() { + // no files = empty value + return Ok(Vec::new()); + } + let mut encoded = vec![]; + encoded.put_u8(AUX_FILE_ENCODING_VERSION); + for (path, content) in files { + if path.len() > u32::MAX as usize { + anyhow::bail!("{} exceeds path size limit", path); + } + encoded.put_u32(path.len() as u32); + encoded.put_slice(path.as_bytes()); + if content.len() > u32::MAX as usize { + anyhow::bail!("{} exceeds content size limit", path); + } + encoded.put_u32(content.len() as u32); + encoded.put_slice(content); + } + Ok(encoded) +} + #[cfg(test)] mod tests { use super::*; @@ -109,4 +188,21 @@ mod tests { encode_aux_file_key("other_file_not_supported").to_string() ); } + + #[test] + fn test_value_encoding() { + let files = vec![ + ("pg_logical/1.file", "1111".as_bytes()), + ("pg_logical/2.file", "2222".as_bytes()), + ]; + assert_eq!( + files, + decode_file_value(&encode_file_value(&files).unwrap()).unwrap() + ); + let files = vec![]; + assert_eq!( + files, + decode_file_value(&encode_file_value(&files).unwrap()).unwrap() + ); + } } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 58b18dae7d97..dca15108101d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -601,7 +601,7 @@ where // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.get_ancestor_lsn() { + if self.timeline.is_ancestor_lsn(self.lsn) { write!(zenith_signal, "PREV LSN: none") .map_err(|e| BasebackupError::Server(e.into()))?; } else { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 1345223a43ac..eb4b8bb8bbd9 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -3,6 +3,7 @@ //! Main entry point for the Page Server executable. 
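// ---- Editor's note, not part of the patch ----
// Recap of the aux-file value format introduced in pageserver/src/aux_file.rs earlier in
// this patch: an empty value means "no files"; otherwise the value is a one-byte version
// tag (0x01) followed by repeated length-prefixed pairs (u32 key_len, key bytes, u32
// val_len, val bytes). Round trip, using the functions added there:
//
//     let files = vec![("pg_logical/1.file", b"1111".as_slice())];
//     let encoded = aux_file::encode_file_value(&files)?;
//     assert_eq!(aux_file::decode_file_value(&encoded)?, files);
// ---- end editor's note ----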
use std::env::{var, VarError}; +use std::io::Read; use std::sync::Arc; use std::time::Duration; use std::{env, ops::ControlFlow, str::FromStr}; @@ -151,37 +152,34 @@ fn initialize_config( workdir: &Utf8Path, ) -> anyhow::Result> { let init = arg_matches.get_flag("init"); - let update_config = init || arg_matches.get_flag("update-config"); - let (mut toml, config_file_exists) = if cfg_file_path.is_file() { - if init { - anyhow::bail!( - "Config file '{cfg_file_path}' already exists, cannot init it, use --update-config to update it", - ); + let file_contents: Option = match std::fs::File::open(cfg_file_path) { + Ok(mut f) => { + if init { + anyhow::bail!("config file already exists: {cfg_file_path}"); + } + let md = f.metadata().context("stat config file")?; + if md.is_file() { + let mut s = String::new(); + f.read_to_string(&mut s).context("read config file")?; + Some(s.parse().context("parse config file toml")?) + } else { + anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}"); + } + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => None, + Err(e) => { + anyhow::bail!("open pageserver config: {e}: {cfg_file_path}"); } - // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(cfg_file_path) - .with_context(|| format!("Failed to read pageserver config at '{cfg_file_path}'"))?; - ( - cfg_file_contents - .parse::() - .with_context(|| { - format!("Failed to parse '{cfg_file_path}' as pageserver config") - })?, - true, - ) - } else if cfg_file_path.exists() { - anyhow::bail!("Config file '{cfg_file_path}' exists but is not a regular file"); - } else { - // We're initializing the tenant, so there's no config file yet - ( - DEFAULT_CONFIG_FILE - .parse::() - .context("could not parse built-in config file")?, - false, - ) }; + let mut effective_config = file_contents.unwrap_or_else(|| { + DEFAULT_CONFIG_FILE + .parse() + .expect("unit tests ensure this works") + }); + + // Patch with overrides from the command line if let Some(values) = arg_matches.get_many::("config-override") { for option_line in values { let doc = toml_edit::Document::from_str(option_line).with_context(|| { @@ -189,22 +187,21 @@ fn initialize_config( })?; for (key, item) in doc.iter() { - if config_file_exists && update_config && key == "id" && toml.contains_key(key) { - anyhow::bail!("Pageserver config file exists at '{cfg_file_path}' and has node id already, it cannot be overridden"); - } - toml.insert(key, item.clone()); + effective_config.insert(key, item.clone()); } } } - debug!("Resulting toml: {toml}"); - let conf = PageServerConf::parse_and_validate(&toml, workdir) + debug!("Resulting toml: {effective_config}"); + + // Construct the runtime representation + let conf = PageServerConf::parse_and_validate(&effective_config, workdir) .context("Failed to parse pageserver configuration")?; - if update_config { + if init { info!("Writing pageserver config to '{cfg_file_path}'"); - std::fs::write(cfg_file_path, toml.to_string()) + std::fs::write(cfg_file_path, effective_config.to_string()) .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?; info!("Config successfully written to '{cfg_file_path}'") } @@ -758,18 +755,13 @@ fn cli() -> Command { // See `settings.md` for more details on the extra configuration patameters pageserver can process .arg( Arg::new("config-override") + .long("config-override") .short('c') .num_args(1) .action(ArgAction::Append) .help("Additional configuration overrides of the ones from the toml 
config file (or new ones to add there). \ Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), ) - .arg( - Arg::new("update-config") - .long("update-config") - .action(ArgAction::SetTrue) - .help("Update the config file when started"), - ) .arg( Arg::new("enabled-features") .long("enabled-features") diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index e3c11cb29963..c937309d8332 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -38,7 +38,7 @@ use deleter::DeleterMessage; use list_writer::ListWriterQueueMessage; use validator::ValidatorQueueMessage; -use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName}; +use crate::{config::PageServerConf, tenant::storage_layer::LayerName}; // TODO: configurable for how long to wait before executing deletions @@ -479,7 +479,7 @@ impl DeletionQueueClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { if current_generation.is_none() { debug!("Enqueuing deletions in legacy mode, skipping queue"); @@ -511,7 +511,7 @@ impl DeletionQueueClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { metrics::DELETION_QUEUE .keys_submitted @@ -734,20 +734,20 @@ mod test { use crate::{ control_plane_client::RetryForeverError, repository::Key, - tenant::{harness::TenantHarness, storage_layer::DeltaFileName}, + tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, }; use super::*; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); - pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51), }); // When you need a second layer in a test. 
- pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61), }); @@ -797,7 +797,7 @@ mod test { /// Returns remote layer file name, suitable for use in assert_remote_files fn write_remote_layer( &self, - file_name: LayerFileName, + file_name: LayerName, gen: Generation, ) -> anyhow::Result { let tenant_shard_id = self.harness.tenant_shard_id; @@ -952,7 +952,7 @@ mod test { let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; - let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); let tenant_shard_id = ctx.harness.tenant_shard_id; let content: Vec = "victim1 contents".into(); diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index 3a3d600ac221..ae3b2c918082 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::LayerFileMetadata; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::virtual_file::on_fatal_io_error; use crate::virtual_file::MaybeFatalIo; @@ -59,7 +59,7 @@ pub(super) struct DeletionOp { // `layers` and `objects` are both just lists of objects. `layers` is used if you do not // have a config object handy to project it to a remote key, and need the consuming worker // to do it for you. 
- pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(super) layers: Vec<(LayerName, LayerFileMetadata)>, pub(super) objects: Vec, /// The _current_ generation of the Tenant shard attachment in which we are enqueuing diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 6248424cee49..ebeb8bbb205e 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -64,7 +64,7 @@ use crate::{ mgr::TenantManager, remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName}, + storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName}, }, }; @@ -540,7 +540,12 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( js.spawn(async move { layer .secondary_tenant - .evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name) + .evict_layer( + tenant_manager.get_conf(), + layer.timeline_id, + layer.name, + layer.metadata, + ) .await; Ok(file_size) }); @@ -599,7 +604,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( pub(crate) struct EvictionSecondaryLayer { pub(crate) secondary_tenant: Arc, pub(crate) timeline_id: TimelineId, - pub(crate) name: LayerFileName, + pub(crate) name: LayerName, pub(crate) metadata: LayerFileMetadata, } @@ -632,9 +637,9 @@ impl EvictionLayer { } } - pub(crate) fn get_name(&self) -> LayerFileName { + pub(crate) fn get_name(&self) -> LayerName { match self { - Self::Attached(l) => l.layer_desc().filename(), + Self::Attached(l) => l.layer_desc().layer_name(), Self::Secondary(sl) => sl.name.clone(), } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index c425f3e628fa..36c74ed140f9 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -420,25 +420,6 @@ paths: description: Tenant scheduled to load successfully /v1/tenant/{tenant_id}/synthetic_size: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - get: - description: | - Calculate tenant's synthetic size - responses: - "200": - description: Tenant's synthetic size - content: - application/json: - schema: - $ref: "#/components/schemas/SyntheticSizeResponse" - - # This route has no handler. TODO: remove? - /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id in: path @@ -468,19 +449,9 @@ paths: content: application/json: schema: - type: object - required: - - id - - size - properties: - id: - type: string - format: hex - size: - type: integer - nullable: true - description: | - Size metric in bytes or null if inputs_only=true was given. + $ref: "#/components/schemas/SyntheticSizeResponse" + text/html: + description: SVG representation of the tenant and it's timelines. "401": description: Unauthorized Error content: @@ -929,6 +900,9 @@ components: format: hex size: type: integer + nullable: true + description: | + Size metric in bytes or null if inputs_only=true was given. 
segment_sizes: type: array items: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index cf526940f432..a8ca642dc59a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -63,6 +63,7 @@ use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; +use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::Timeline; use crate::tenant::SpawnMode; @@ -1228,13 +1229,15 @@ async fn layer_download_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let layer_name = LayerName::from_str(layer_file_name) + .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; let state = get_state(&request); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let downloaded = timeline - .download_layer(layer_file_name) + .download_layer(&layer_name) .await .map_err(ApiError::InternalServerError)?; @@ -1258,11 +1261,14 @@ async fn evict_timeline_layer_handler( let layer_file_name = get_request_param(&request, "layer_file_name")?; let state = get_state(&request); + let layer_name = LayerName::from_str(layer_file_name) + .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let evicted = timeline - .evict_layer(layer_file_name) + .evict_layer(&layer_name) .await .map_err(ApiError::InternalServerError)?; @@ -1827,6 +1833,75 @@ async fn timeline_download_remote_layers_handler_get( json_response(StatusCode::OK, info) } +async fn timeline_detach_ancestor_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + use crate::tenant::timeline::detach_ancestor::Options; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); + + async move { + let mut options = Options::default(); + + let rewrite_concurrency = + parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?; + let copy_concurrency = + parse_query_param::<_, std::num::NonZeroUsize>(&request, "copy_concurrency")?; + + [ + (&mut options.rewrite_concurrency, rewrite_concurrency), + (&mut options.copy_concurrency, copy_concurrency), + ] + .into_iter() + .filter_map(|(target, val)| val.map(|val| (target, val))) + .for_each(|(target, val)| *target = val); + + let state = get_state(&request); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download); + let ctx = &ctx; + + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(|e| ApiError::NotFound(e.into()))?; + + let (_guard, prepared) = timeline + .prepare_to_detach_from_ancestor(&tenant, options, ctx) + .await + .map_err(|e| 
ApiError::InternalServerError(e.into()))?; + + let res = state + .tenant_manager + .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx) + .await; + + match res { + Ok(reparented_timelines) => { + let resp = pageserver_api::models::detach_ancestor::AncestorDetached { + reparented_timelines, + }; + + json_response(StatusCode::OK, resp) + } + Err(e) => Err(ApiError::InternalServerError( + e.context("timeline detach completion"), + )), + } + } + .instrument(span) + .await +} + async fn deletion_queue_flush( r: Request, cancel: CancellationToken, @@ -2515,6 +2590,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers", |r| api_handler(r, timeline_download_remote_layers_handler_get), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor", + |r| api_handler(r, timeline_detach_ancestor_handler), + ) .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_delete_handler) }) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 903bad34ccd9..256f2f334c1d 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1512,29 +1512,80 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy = Lazy }); pub(crate) struct TenantManagerMetrics { - pub(crate) tenant_slots: UIntGauge, + tenant_slots_attached: UIntGauge, + tenant_slots_secondary: UIntGauge, + tenant_slots_inprogress: UIntGauge, pub(crate) tenant_slot_writes: IntCounter, pub(crate) unexpected_errors: IntCounter, } +impl TenantManagerMetrics { + /// Helpers for tracking slots. Note that these do not track the lifetime of TenantSlot objects + /// exactly: they track the lifetime of the slots _in the tenant map_. + pub(crate) fn slot_inserted(&self, slot: &TenantSlot) { + match slot { + TenantSlot::Attached(_) => { + self.tenant_slots_attached.inc(); + } + TenantSlot::Secondary(_) => { + self.tenant_slots_secondary.inc(); + } + TenantSlot::InProgress(_) => { + self.tenant_slots_inprogress.inc(); + } + } + } + + pub(crate) fn slot_removed(&self, slot: &TenantSlot) { + match slot { + TenantSlot::Attached(_) => { + self.tenant_slots_attached.dec(); + } + TenantSlot::Secondary(_) => { + self.tenant_slots_secondary.dec(); + } + TenantSlot::InProgress(_) => { + self.tenant_slots_inprogress.dec(); + } + } + } + + #[cfg(all(debug_assertions, not(test)))] + pub(crate) fn slots_total(&self) -> u64 { + self.tenant_slots_attached.get() + + self.tenant_slots_secondary.get() + + self.tenant_slots_inprogress.get() + } +} + pub(crate) static TENANT_MANAGER: Lazy = Lazy::new(|| { - TenantManagerMetrics { - tenant_slots: register_uint_gauge!( + let tenant_slots = register_uint_gauge_vec!( "pageserver_tenant_manager_slots", "How many slots currently exist, including all attached, secondary and in-progress operations", + &["mode"] ) - .expect("failed to define a metric"), - tenant_slot_writes: register_int_counter!( - "pageserver_tenant_manager_slot_writes", - "Writes to a tenant slot, including all of create/attach/detach/delete" - ) - .expect("failed to define a metric"), - unexpected_errors: register_int_counter!( - "pageserver_tenant_manager_unexpected_errors_total", - "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug." 
- ) - .expect("failed to define a metric"), -} + .expect("failed to define a metric"); + TenantManagerMetrics { + tenant_slots_attached: tenant_slots + .get_metric_with_label_values(&["attached"]) + .unwrap(), + tenant_slots_secondary: tenant_slots + .get_metric_with_label_values(&["secondary"]) + .unwrap(), + tenant_slots_inprogress: tenant_slots + .get_metric_with_label_values(&["inprogress"]) + .unwrap(), + tenant_slot_writes: register_int_counter!( + "pageserver_tenant_manager_slot_writes", + "Writes to a tenant slot, including all of create/attach/detach/delete" + ) + .expect("failed to define a metric"), + unexpected_errors: register_int_counter!( + "pageserver_tenant_manager_unexpected_errors_total", + "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug." + ) + .expect("failed to define a metric"), + } }); pub(crate) struct DeletionQueueMetrics { @@ -2275,6 +2326,7 @@ use std::time::{Duration, Instant}; use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; +use crate::tenant::mgr::TenantSlot; /// Maintain a per timeline gauge in addition to the global gauge. struct PerTimelineRemotePhysicalSizeGauge { @@ -2877,6 +2929,8 @@ pub fn preinitialize_metrics() { &WALRECEIVER_CANDIDATES_REMOVED, &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES, &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, + &REMOTE_ONDEMAND_DOWNLOADED_LAYERS, + &REMOTE_ONDEMAND_DOWNLOADED_BYTES, ] .into_iter() .for_each(|c| { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 12314c596143..a4215ee107b2 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -10,9 +10,9 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::metrics::WAL_INGEST; -use crate::repository::*; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; +use crate::{aux_file, repository::*}; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; @@ -24,6 +24,7 @@ use pageserver_api::key::{ AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -670,7 +671,7 @@ impl Timeline { self.get(CHECKPOINT_KEY, lsn, ctx).await } - pub(crate) async fn list_aux_files( + async fn list_aux_files_v1( &self, lsn: Lsn, ctx: &RequestContext, @@ -688,6 +689,63 @@ impl Timeline { } } + async fn list_aux_files_v2( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let kv = self + .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx) + .await + .context("scan")?; + let mut result = HashMap::new(); + for (_, v) in kv { + let v = v.context("get value")?; + let v = aux_file::decode_file_value_bytes(&v).context("value decode")?; + for (fname, content) in v { + result.insert(fname, content); + } + } + Ok(result) + } + + pub(crate) async fn list_aux_files( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + match self.get_switch_aux_file_policy() { + AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await, + AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await, + 
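+            // Cross-validation reads through both the v1 and the v2 path and fails the
+            // request if the results disagree, so mismatches surface before a tenant is
+            // switched to v2 permanently.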
AuxFilePolicy::CrossValidation => { + let v1_result = self.list_aux_files_v1(lsn, ctx).await; + let v2_result = self.list_aux_files_v2(lsn, ctx).await; + match (v1_result, v2_result) { + (Ok(v1), Ok(v2)) => { + if v1 != v2 { + tracing::error!( + "unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}" + ); + return Err(PageReconstructError::Other(anyhow::anyhow!( + "unmatched aux file v1 v2 result" + ))); + } + Ok(v1) + } + (Ok(_), Err(v2)) => { + tracing::error!("aux file v1 returns Ok while aux file v2 returns an err"); + Err(v2) + } + (Err(v1), Ok(_)) => { + tracing::error!("aux file v2 returns Ok while aux file v1 returns an err"); + Err(v1) + } + (Err(_), Err(v2)) => Err(v2), + } + } + } + } + /// Does the same as get_current_logical_size but counted on demand. /// Used to initialize the logical size tracking on startup. /// @@ -1389,6 +1447,9 @@ impl<'a> DatadirModification<'a> { } pub fn init_aux_dir(&mut self) -> anyhow::Result<()> { + if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() { + return Ok(()); + } let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { files: HashMap::new(), })?; @@ -1404,89 +1465,121 @@ impl<'a> DatadirModification<'a> { content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let file_path = path.to_string(); - let content = if content.is_empty() { - None - } else { - Some(Bytes::copy_from_slice(content)) - }; + let policy = self.tline.get_switch_aux_file_policy(); + if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy { + let key = aux_file::encode_aux_file_key(path); + // retrieve the key from the engine + let old_val = match self.get(key, ctx).await { + Ok(val) => Some(val), + Err(PageReconstructError::MissingKey(_)) => None, + Err(e) => return Err(e.into()), + }; + let files = if let Some(ref old_val) = old_val { + aux_file::decode_file_value(old_val)? + } else { + Vec::new() + }; + let new_files = if content.is_empty() { + files + .into_iter() + .filter(|(p, _)| &path != p) + .collect::>() + } else { + files + .into_iter() + .filter(|(p, _)| &path != p) + .chain(std::iter::once((path, content))) + .collect::>() + }; + let new_val = aux_file::encode_file_value(&new_files)?; + self.put(key, Value::Image(new_val.into())); + } - let n_files; - let mut aux_files = self.tline.aux_files.lock().await; - if let Some(mut dir) = aux_files.dir.take() { - // We already updated aux files in `self`: emit a delta and update our latest value. 
- dir.upsert(file_path.clone(), content.clone()); - n_files = dir.files.len(); - if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); - aux_files.n_deltas = 0; + if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy { + let file_path = path.to_string(); + let content = if content.is_empty() { + None } else { - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), - ); - aux_files.n_deltas += 1; - } - aux_files.dir = Some(dir); - } else { - // Check if the AUX_FILES_KEY is initialized - match self.get(AUX_FILES_KEY, ctx).await { - Ok(dir_bytes) => { - let mut dir = AuxFilesDirectory::des(&dir_bytes)?; - // Key is already set, we may append a delta - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { - file_path: file_path.clone(), - content: content.clone(), - }), - ); - dir.upsert(file_path, content); - n_files = dir.files.len(); - aux_files.dir = Some(dir); - } - Err( - e @ (PageReconstructError::AncestorStopping(_) - | PageReconstructError::Cancelled - | PageReconstructError::AncestorLsnTimeout(_)), - ) => { - // Important that we do not interpret a shutdown error as "not found" and thereby - // reset the map. - return Err(e.into()); - } - // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but - // the original code assumes all other errors are missing keys. Therefore, we keep the code path - // the same for now, though in theory, we should only match the `MissingKey` variant. - Err( - PageReconstructError::Other(_) - | PageReconstructError::WalRedo(_) - | PageReconstructError::MissingKey { .. }, - ) => { - // Key is missing, we must insert an image as the basis for subsequent deltas. - - let mut dir = AuxFilesDirectory { - files: HashMap::new(), - }; - dir.upsert(file_path, content); + Some(Bytes::copy_from_slice(content)) + }; + + let n_files; + let mut aux_files = self.tline.aux_files.lock().await; + if let Some(mut dir) = aux_files.dir.take() { + // We already updated aux files in `self`: emit a delta and update our latest value. + dir.upsert(file_path.clone(), content.clone()); + n_files = dir.files.len(); + if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { self.put( AUX_FILES_KEY, Value::Image(Bytes::from( AuxFilesDirectory::ser(&dir).context("serialize")?, )), ); - n_files = 1; - aux_files.dir = Some(dir); + aux_files.n_deltas = 0; + } else { + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), + ); + aux_files.n_deltas += 1; + } + aux_files.dir = Some(dir); + } else { + // Check if the AUX_FILES_KEY is initialized + match self.get(AUX_FILES_KEY, ctx).await { + Ok(dir_bytes) => { + let mut dir = AuxFilesDirectory::des(&dir_bytes)?; + // Key is already set, we may append a delta + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { + file_path: file_path.clone(), + content: content.clone(), + }), + ); + dir.upsert(file_path, content); + n_files = dir.files.len(); + aux_files.dir = Some(dir); + } + Err( + e @ (PageReconstructError::AncestorStopping(_) + | PageReconstructError::Cancelled + | PageReconstructError::AncestorLsnTimeout(_)), + ) => { + // Important that we do not interpret a shutdown error as "not found" and thereby + // reset the map. 
+ return Err(e.into()); + } + // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but + // the original code assumes all other errors are missing keys. Therefore, we keep the code path + // the same for now, though in theory, we should only match the `MissingKey` variant. + Err( + PageReconstructError::Other(_) + | PageReconstructError::WalRedo(_) + | PageReconstructError::MissingKey { .. }, + ) => { + // Key is missing, we must insert an image as the basis for subsequent deltas. + + let mut dir = AuxFilesDirectory { + files: HashMap::new(), + }; + dir.upsert(file_path, content); + self.put( + AUX_FILES_KEY, + Value::Image(Bytes::from( + AuxFilesDirectory::ser(&dir).context("serialize")?, + )), + ); + n_files = 1; + aux_files.dir = Some(dir); + } } } - } - self.pending_directory_entries - .push((DirectoryKind::AuxFiles, n_files)); + self.pending_directory_entries + .push((DirectoryKind::AuxFiles, n_files)); + } Ok(()) } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 0a9ac50aad1e..7b30c3ecf75a 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -33,7 +33,6 @@ impl Value { } } -#[cfg(test)] #[derive(Debug, PartialEq)] pub(crate) enum InvalidInput { TooShortValue, @@ -42,10 +41,8 @@ pub(crate) enum InvalidInput { /// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets /// use this type for querying if a slice looks some particular way. -#[cfg(test)] pub(crate) struct ValueBytes; -#[cfg(test)] impl ValueBytes { pub(crate) fn will_init(raw: &[u8]) -> Result { if raw.len() < 12 { diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 0c245580eeda..5f46ce3d69b7 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -319,6 +319,9 @@ pub enum TaskKind { // Eviction. One per timeline. Eviction, + // Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure) + IngestHousekeeping, + /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, @@ -367,6 +370,8 @@ pub enum TaskKind { #[cfg(test)] UnitTest, + + DetachAncestor, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fdc49ae29553..010e56a899db 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -322,6 +322,9 @@ pub struct Tenant { /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. pub(crate) timeline_get_throttle: Arc>, + + /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. + ongoing_timeline_detach: std::sync::Mutex>, } impl std::fmt::Debug for Tenant { @@ -1676,6 +1679,34 @@ impl Tenant { Ok(()) } + // Call through to all timelines to freeze ephemeral layers if needed. Usually + // this happens during ingest: this background housekeeping is for freezing layers + // that are open but haven't been written to for some time. + async fn ingest_housekeeping(&self) { + // Scan through the hashmap and collect a list of all the timelines, + // while holding the lock. Then drop the lock and actually perform the + // compactions. We don't want to block everything else while the + // compaction runs. 
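+        // (The per-timeline work in this pass is maybe_freeze_ephemeral_layer(), not an
+        // actual compaction.)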
+ let timelines = { + self.timelines + .lock() + .unwrap() + .values() + .filter_map(|timeline| { + if timeline.is_active() { + Some(timeline.clone()) + } else { + None + } + }) + .collect::>() + }; + + for timeline in &timelines { + timeline.maybe_freeze_ephemeral_layer().await; + } + } + pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } @@ -2529,6 +2560,7 @@ impl Tenant { &crate::metrics::tenant_throttling::TIMELINE_GET, )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), + ongoing_timeline_detach: std::sync::Mutex::default(), } } @@ -3726,7 +3758,7 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), - switch_to_aux_file_v2: Some(tenant_conf.switch_to_aux_file_v2), + switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy), } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 9975c9edbc9d..a743ce3c16a3 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,6 +9,7 @@ //! may lead to a data loss. //! use anyhow::bail; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::CompactionAlgorithm; use pageserver_api::models::EvictionPolicy; use pageserver_api::models::{self, ThrottleConfig}; @@ -370,9 +371,9 @@ pub struct TenantConf { // Expresed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, - /// Switch to aux file v2. Switching this flag requires the user has not written any aux file into + /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. - pub switch_to_aux_file_v2: bool, + pub switch_aux_file_policy: AuxFilePolicy, } /// Same as TenantConf, but this struct preserves the information about @@ -471,7 +472,7 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] - pub switch_to_aux_file_v2: Option, + pub switch_aux_file_policy: Option, } impl TenantConfOpt { @@ -529,9 +530,9 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), - switch_to_aux_file_v2: self - .switch_to_aux_file_v2 - .unwrap_or(global_conf.switch_to_aux_file_v2), + switch_aux_file_policy: self + .switch_aux_file_policy + .unwrap_or(global_conf.switch_aux_file_policy), } } } @@ -573,7 +574,7 @@ impl Default for TenantConf { lazy_slru_download: false, timeline_get_throttle: crate::tenant::throttle::Config::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, - switch_to_aux_file_v2: false, + switch_aux_file_policy: AuxFilePolicy::V1, } } } @@ -648,7 +649,7 @@ impl From for models::TenantConfig { lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, - switch_to_aux_file_v2: value.switch_to_aux_file_v2, + switch_aux_file_policy: value.switch_aux_file_policy, } } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 33d0f677e56f..2e5259bfe200 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -585,9 +585,20 @@ impl DeleteTenantFlow { // FIXME: we should not be modifying this from outside of mgr.rs. 
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080) - crate::metrics::TENANT_MANAGER - .tenant_slots - .set(locked.len() as u64); + + // Update stats + match &removed { + TenantsMapRemoveResult::Occupied(slot) => { + crate::metrics::TENANT_MANAGER.slot_removed(slot); + } + TenantsMapRemoveResult::InProgress(barrier) => { + crate::metrics::TENANT_MANAGER + .slot_removed(&TenantSlot::InProgress(barrier.clone())); + } + TenantsMapRemoveResult::Vacant => { + // Nothing changed in map, no metric update + } + } match removed { TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => { diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 39da71347925..fc71ea764254 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -207,6 +207,24 @@ impl TimelineMetadata { self.body.ancestor_lsn } + /// When reparenting, the `ancestor_lsn` does not change. + pub fn reparent(&mut self, timeline: &TimelineId) { + assert!(self.body.ancestor_timeline.is_some()); + // no assertion for redoing this: it's fine, we may have to repeat this multiple times over + self.body.ancestor_timeline = Some(*timeline); + } + + pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { + if let Some(ancestor) = self.body.ancestor_timeline { + assert_eq!(ancestor, branchpoint.0); + } + if self.body.ancestor_lsn != Lsn(0) { + assert_eq!(self.body.ancestor_lsn, branchpoint.1); + } + self.body.ancestor_timeline = None; + self.body.ancestor_lsn = Lsn(0); + } + pub fn latest_gc_cutoff_lsn(&self) -> Lsn { self.body.latest_gc_cutoff_lsn } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 006d501daa6f..6be66e99adc4 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -56,6 +56,7 @@ use utils::id::{TenantId, TimelineId}; use super::delete::DeleteTenantError; use super::secondary::SecondaryTenant; +use super::timeline::detach_ancestor::PreparedTimelineDetach; use super::TenantSharedResources; /// For a tenant that appears in TenantsMap, it may either be @@ -246,6 +247,7 @@ impl TenantsMap { } } + #[cfg(all(debug_assertions, not(test)))] pub(crate) fn len(&self) -> usize { match self { TenantsMap::Initializing => 0, @@ -746,6 +748,7 @@ pub async fn init_tenant_mgr( } }; + METRICS.slot_inserted(&slot); tenants.insert(tenant_shard_id, slot); } @@ -753,7 +756,7 @@ pub async fn init_tenant_mgr( let mut tenants_map = TENANTS.write().unwrap(); assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); - METRICS.tenant_slots.set(tenants.len() as u64); + *tenants_map = TenantsMap::Open(tenants); Ok(TenantManager { @@ -824,6 +827,14 @@ fn tenant_spawn( async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { let mut join_set = JoinSet::new(); + #[cfg(all(debug_assertions, not(test)))] + { + // Check that our metrics properly tracked the size of the tenants map. This is a convenient location to check, + // as it happens implicitly at the end of tests etc. + let m = tenants.read().unwrap(); + debug_assert_eq!(METRICS.slots_total(), m.len() as u64); + } + // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants. let (total_in_progress, total_attached) = { let mut m = tenants.write().unwrap(); @@ -1997,6 +2008,101 @@ impl TenantManager { }) .collect()) } + + /// Completes an earlier prepared timeline detach ancestor. 
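The handler added in `routes.rs` earlier in this diff ultimately calls the function below; on the wire it is a plain HTTP PUT. A hypothetical client-side sketch (reqwest, tokio and anyhow assumed as dependencies; the base URL, ids and concurrency values are placeholders, and `AncestorDetached` is mirrored locally for the example rather than imported from `pageserver_api`):

use serde::Deserialize;

// Local mirror of pageserver_api::models::detach_ancestor::AncestorDetached, for this sketch only.
#[derive(Deserialize, Debug)]
struct AncestorDetached {
    reparented_timelines: Vec<String>,
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // All of these are placeholders for a real deployment.
    let pageserver = "http://127.0.0.1:9898";
    let tenant_shard_id = "<tenant_shard_id>";
    let timeline_id = "<timeline_id>";

    // rewrite_concurrency / copy_concurrency are optional; omitted values fall back to
    // Options::default() in the handler.
    let url = format!(
        "{pageserver}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor?rewrite_concurrency=4&copy_concurrency=8"
    );

    let resp = reqwest::Client::new().put(url).send().await?;
    let detached: AncestorDetached = resp.error_for_status()?.json().await?;
    println!("reparented timelines: {:?}", detached.reparented_timelines);
    Ok(())
}

On success the handler responds with 200 and the set of timelines that were reparented as part of the operation.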
+ pub(crate) async fn complete_detaching_timeline_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + prepared: PreparedTimelineDetach, + ctx: &RequestContext, + ) -> Result, anyhow::Error> { + struct RevertOnDropSlot(Option); + + impl Drop for RevertOnDropSlot { + fn drop(&mut self) { + if let Some(taken) = self.0.take() { + taken.revert(); + } + } + } + + impl RevertOnDropSlot { + fn into_inner(mut self) -> SlotGuard { + self.0.take().unwrap() + } + } + + impl std::ops::Deref for RevertOnDropSlot { + type Target = SlotGuard; + + fn deref(&self) -> &Self::Target { + self.0.as_ref().unwrap() + } + } + + let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + let slot_guard = RevertOnDropSlot(Some(slot_guard)); + + let tenant = { + let Some(old_slot) = slot_guard.get_old_value() else { + anyhow::bail!( + "Tenant not found when trying to complete detaching timeline ancestor" + ); + }; + + let Some(tenant) = old_slot.get_attached() else { + anyhow::bail!("Tenant is not in attached state"); + }; + + if !tenant.is_active() { + anyhow::bail!("Tenant is not active"); + } + + tenant.clone() + }; + + let timeline = tenant.get_timeline(timeline_id, true)?; + + let reparented = timeline + .complete_detaching_timeline_ancestor(&tenant, prepared, ctx) + .await?; + + let mut slot_guard = slot_guard.into_inner(); + + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => { + slot_guard.drop_old_value()?; + } + Err(_barrier) => { + slot_guard.revert(); + // this really should not happen, at all, unless shutdown was already going? + anyhow::bail!("Cannot restart Tenant, already shutting down"); + } + } + + let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; + + let shard_identity = config.shard; + let tenant = tenant_spawn( + self.conf, + tenant_shard_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(config)?, + shard_identity, + None, + self.tenants, + SpawnMode::Eager, + ctx, + )?; + + slot_guard.upsert(TenantSlot::Attached(tenant))?; + + Ok(reparented) + } } #[derive(Debug, thiserror::Error)] @@ -2428,10 +2534,13 @@ impl SlotGuard { TenantsMap::Open(m) => m, }; + METRICS.slot_inserted(&new_value); + let replaced = m.insert(self.tenant_shard_id, new_value); self.upserted = true; - - METRICS.tenant_slots.set(m.len() as u64); + if let Some(replaced) = replaced.as_ref() { + METRICS.slot_removed(replaced); + } replaced }; @@ -2541,9 +2650,13 @@ impl Drop for SlotGuard { } if self.old_value_is_shutdown() { + METRICS.slot_removed(entry.get()); entry.remove(); } else { - entry.insert(self.old_value.take().unwrap()); + let inserting = self.old_value.take().unwrap(); + METRICS.slot_inserted(&inserting); + let replaced = entry.insert(inserting); + METRICS.slot_removed(&replaced); } } Entry::Vacant(_) => { @@ -2554,8 +2667,6 @@ impl Drop for SlotGuard { ); } } - - METRICS.tenant_slots.set(m.len() as u64); } } @@ -2635,7 +2746,9 @@ fn tenant_map_acquire_slot_impl( } _ => { let (completion, barrier) = utils::completion::channel(); - v.insert(TenantSlot::InProgress(barrier)); + let inserting = TenantSlot::InProgress(barrier); + METRICS.slot_inserted(&inserting); + v.insert(inserting); tracing::debug!("Vacant, inserted InProgress"); Ok(SlotGuard::new(*tenant_shard_id, None, completion)) } @@ -2671,7 +2784,10 @@ fn tenant_map_acquire_slot_impl( _ => { // Happy case: the slot was 
not in any state that violated our mode let (completion, barrier) = utils::completion::channel(); - let old_value = o.insert(TenantSlot::InProgress(barrier)); + let in_progress = TenantSlot::InProgress(barrier); + METRICS.slot_inserted(&in_progress); + let old_value = o.insert(in_progress); + METRICS.slot_removed(&old_value); tracing::debug!("Occupied, replaced with InProgress"); Ok(SlotGuard::new( *tenant_shard_id, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index a54e93c96b96..9103760388ce 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -240,7 +240,7 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; use super::metadata::MetadataUpdate; -use super::storage_layer::{Layer, LayerFileName, ResidentLayer}; +use super::storage_layer::{Layer, LayerName, ResidentLayer}; use super::upload_queue::SetDeletedFlagProgress; use super::Generation; @@ -437,6 +437,19 @@ impl RemoteTimelineClient { } } + /// Returns true if this timeline was previously detached at this Lsn and the remote timeline + /// client is currently initialized. + pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { + // technically this is a dirty read, but given how timeline detach ancestor is implemented + // via tenant restart, the lineage has always been uploaded. + self.upload_queue + .lock() + .unwrap() + .initialized_mut() + .map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn)) + .unwrap_or(false) + } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part @@ -503,7 +516,7 @@ impl RemoteTimelineClient { /// On success, returns the size of the downloaded file. pub async fn download_layer_file( &self, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, cancel: &CancellationToken, ctx: &RequestContext, @@ -570,7 +583,7 @@ impl RemoteTimelineClient { // ahead of what's _actually_ on the remote during index upload. 
upload_queue.latest_metadata = metadata.clone(); - self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + self.schedule_index_upload(upload_queue); Ok(()) } @@ -591,7 +604,7 @@ impl RemoteTimelineClient { upload_queue.latest_metadata.apply(update); - self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + self.schedule_index_upload(upload_queue); Ok(()) } @@ -611,18 +624,14 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + self.schedule_index_upload(upload_queue); } Ok(()) } /// Launch an index-file upload operation in the background (internal function) - fn schedule_index_upload( - self: &Arc, - upload_queue: &mut UploadQueueInitialized, - metadata: TimelineMetadata, - ) { + fn schedule_index_upload(self: &Arc, upload_queue: &mut UploadQueueInitialized) { let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); info!( @@ -631,12 +640,8 @@ impl RemoteTimelineClient { upload_queue.latest_files_changes_since_metadata_upload_scheduled, ); - let index_part = IndexPart::new( - upload_queue.latest_files.clone(), - disk_consistent_lsn, - metadata, - ); - let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); + let index_part = IndexPart::from(&*upload_queue); + let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn); self.metric_begin(&op); upload_queue.queued_operations.push_back(op); upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; @@ -645,9 +650,67 @@ impl RemoteTimelineClient { self.launch_queued_tasks(upload_queue); } + pub(crate) async fn schedule_reparenting_and_wait( + self: &Arc, + new_parent: &TimelineId, + ) -> anyhow::Result<()> { + // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing + // and reads the in-memory part we cannot do the detaching like this + let receiver = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else { + return Err(anyhow::anyhow!( + "cannot reparent without a current ancestor" + )); + }; + + upload_queue.latest_metadata.reparent(new_parent); + upload_queue.latest_lineage.record_previous_ancestor(&prev); + + self.schedule_index_upload(upload_queue); + + self.schedule_barrier0(upload_queue) + }; + + Self::wait_completion0(receiver).await + } + + /// Schedules uploading a new version of `index_part.json` with the given layers added, + /// detaching from ancestor and waits for it to complete. /// - /// Launch an upload operation in the background. - /// + /// This is used with `Timeline::detach_ancestor` functionality. 
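`schedule_reparenting_and_wait` above and `schedule_adding_existing_layers_to_index_detach_and_wait` below follow the same shape as `wait_completion`: register a barrier while the upload-queue lock is held, then await the corresponding `tokio::sync::watch` receiver after releasing it. A stripped-down, standalone sketch of that pattern (the `pending` vector and function names are illustrative, not the real upload queue):

use tokio::sync::watch;

// Barrier as used by wait_completion0: the queue keeps the sender, the caller awaits the
// receiver outside of the queue lock; a dropped sender signals a stopped queue.
fn schedule_barrier(pending: &mut Vec<watch::Sender<()>>) -> watch::Receiver<()> {
    let (tx, rx) = watch::channel(());
    pending.push(tx);
    rx
}

async fn wait_completion(mut rx: watch::Receiver<()>) -> anyhow::Result<()> {
    if rx.changed().await.is_err() {
        anyhow::bail!("upload queue was stopped");
    }
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let mut pending = Vec::new();
    let rx = schedule_barrier(&mut pending);

    // Stand-in for the upload task: fire queued barriers once earlier operations are done.
    tokio::spawn(async move {
        for tx in pending {
            let _ = tx.send(());
        }
    });

    wait_completion(rx).await
}

In the real client the sender side lives in the upload queue and is, roughly, completed once every operation queued before the barrier has finished.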
+ pub(crate) async fn schedule_adding_existing_layers_to_index_detach_and_wait( + self: &Arc, + layers: &[Layer], + adopted: (TimelineId, Lsn), + ) -> anyhow::Result<()> { + let barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + upload_queue.latest_metadata.detach_from_ancestor(&adopted); + upload_queue.latest_lineage.record_detaching(&adopted); + + for layer in layers { + upload_queue + .latest_files + .insert(layer.layer_desc().layer_name(), layer.metadata()); + } + + self.schedule_index_upload(upload_queue); + + let barrier = self.schedule_barrier0(upload_queue); + self.launch_queued_tasks(upload_queue); + barrier + }; + + Self::wait_completion0(barrier).await + } + + /// Launch an upload operation in the background; the file is added to be included in next + /// `index_part.json` upload. pub(crate) fn schedule_layer_file_upload( self: &Arc, layer: ResidentLayer, @@ -669,13 +732,15 @@ impl RemoteTimelineClient { upload_queue .latest_files - .insert(layer.layer_desc().filename(), metadata.clone()); + .insert(layer.layer_desc().layer_name(), metadata.clone()); upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; info!( - "scheduled layer file upload {layer} gen={:?} shard={:?}", - metadata.generation, metadata.shard + gen=?metadata.generation, + shard=?metadata.shard, + "scheduled layer file upload {layer}", ); + let op = UploadOp::UploadLayer(layer, metadata); self.metric_begin(&op); upload_queue.queued_operations.push_back(op); @@ -691,7 +756,7 @@ impl RemoteTimelineClient { /// successfully. pub fn schedule_layer_file_deletion( self: &Arc, - names: &[LayerFileName], + names: &[LayerName], ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -719,7 +784,7 @@ impl RemoteTimelineClient { // the layer files as "dangling". this is fine, at worst case we create work for the // scrubber. - let names = gc_layers.iter().map(|x| x.layer_desc().filename()); + let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); @@ -734,14 +799,10 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Vec<(LayerFileName, LayerFileMetadata)> + ) -> Vec<(LayerName, LayerFileMetadata)> where - I: IntoIterator, + I: IntoIterator, { - // Deleting layers doesn't affect the values stored in TimelineMetadata, - // so we don't need update it. Just serialize it. - let metadata = upload_queue.latest_metadata.clone(); - // Decorate our list of names with each name's metadata, dropping // names that are unexpectedly missing from our metadata. This metadata // is later used when physically deleting layers, to construct key paths. @@ -780,7 +841,7 @@ impl RemoteTimelineClient { // index_part update, because that needs to be uploaded before we can actually delete the // files. if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue, metadata); + self.schedule_index_upload(upload_queue); } with_metadata @@ -790,7 +851,7 @@ impl RemoteTimelineClient { /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`]. 
pub(crate) fn schedule_deletion_of_unlinked( self: &Arc, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -803,7 +864,7 @@ impl RemoteTimelineClient { fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, - mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, + mut with_metadata: Vec<(LayerName, LayerFileMetadata)>, ) { // Filter out any layers which were not created by this tenant shard. These are // layers that originate from some ancestor shard after a split, and may still @@ -872,7 +933,7 @@ impl RemoteTimelineClient { self.schedule_layer_file_upload0(upload_queue, layer.clone()); } - let names = compacted_from.iter().map(|x| x.layer_desc().filename()); + let names = compacted_from.iter().map(|x| x.layer_desc().layer_name()); self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); @@ -882,12 +943,18 @@ impl RemoteTimelineClient { /// Wait for all previously scheduled uploads/deletions to complete pub(crate) async fn wait_completion(self: &Arc) -> anyhow::Result<()> { - let mut receiver = { + let receiver = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; self.schedule_barrier0(upload_queue) }; + Self::wait_completion0(receiver).await + } + + async fn wait_completion0( + mut receiver: tokio::sync::watch::Receiver<()>, + ) -> anyhow::Result<()> { if receiver.changed().await.is_err() { anyhow::bail!("wait_completion aborted because upload queue was stopped"); } @@ -1003,8 +1070,7 @@ impl RemoteTimelineClient { let deleted_at = Utc::now().naive_utc(); stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at); - let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion) - .context("IndexPart serialize")?; + let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion); index_part.deleted_at = Some(deleted_at); index_part }; @@ -1085,6 +1151,93 @@ impl RemoteTimelineClient { Ok(()) } + /// Uploads the given layer **without** adding it to be part of a future `index_part.json` upload. + /// + /// This is not normally needed. + pub(crate) async fn upload_layer_file( + self: &Arc, + uploaded: &ResidentLayer, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + self.tenant_shard_id.to_index(), + &uploaded.layer_desc().layer_name(), + uploaded.metadata().generation, + ); + + backoff::retry( + || async { + upload::upload_timeline_layer( + &self.storage_impl, + uploaded.local_path(), + &remote_path, + uploaded.metadata().file_size(), + cancel, + ) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "upload a layer without adding it to latest files", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("upload a layer without adding it to latest files") + } + + /// Copies the `adopted` remote existing layer to the remote path of `adopted_as`. The layer is + /// not added to be part of a future `index_part.json` upload. 
+ pub(crate) async fn copy_timeline_layer( + self: &Arc, + adopted: &Layer, + adopted_as: &Layer, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let source_remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &adopted + .get_timeline_id() + .expect("Source timeline should be alive"), + self.tenant_shard_id.to_index(), + &adopted.layer_desc().layer_name(), + adopted.metadata().generation, + ); + + let target_remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + self.tenant_shard_id.to_index(), + &adopted_as.layer_desc().layer_name(), + adopted_as.metadata().generation, + ); + + backoff::retry( + || async { + upload::copy_timeline_layer( + &self.storage_impl, + &source_remote_path, + &target_remote_path, + cancel, + ) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "copy timeline layer", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("remote copy timeline layer") + } + async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> { match tokio::time::timeout( DELETION_QUEUE_FLUSH_TIMEOUT, @@ -1256,7 +1409,7 @@ impl RemoteTimelineClient { while let Some(next_op) = upload_queue.queued_operations.front() { // Can we run this task now? let can_run_now = match next_op { - UploadOp::UploadLayer(_, _) => { + UploadOp::UploadLayer(..) => { // Can always be scheduled. true } @@ -1383,13 +1536,25 @@ impl RemoteTimelineClient { let upload_result: anyhow::Result<()> = match &task.op { UploadOp::UploadLayer(ref layer, ref layer_metadata) => { - let path = layer.local_path(); + let local_path = layer.local_path(); + + // We should only be uploading layers created by this `Tenant`'s lifetime, so + // the metadata in the upload should always match our current generation. + assert_eq!(layer_metadata.generation, self.generation); + + let remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + layer_metadata.shard, + &layer.layer_desc().layer_name(), + layer_metadata.generation, + ); + upload::upload_timeline_layer( - self.conf, &self.storage_impl, - path, - layer_metadata, - self.generation, + local_path, + &remote_path, + layer_metadata.file_size(), &self.cancel, ) .measure_remote_op( @@ -1665,6 +1830,7 @@ impl RemoteTimelineClient { latest_files: initialized.latest_files.clone(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: initialized.latest_metadata.clone(), + latest_lineage: initialized.latest_lineage.clone(), projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn @@ -1750,14 +1916,14 @@ pub fn remote_layer_path( tenant_id: &TenantId, timeline_id: &TimelineId, shard: ShardIndex, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, generation: Generation, ) -> RemotePath { // Generation-aware key format let path = format!( "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}", shard.get_suffix(), - layer_file_name.file_name(), + layer_file_name, generation.get_suffix() ); @@ -1818,29 +1984,6 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { } } -/// Files on the remote storage are stored with paths, relative to the workdir. -/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path. -/// -/// Errors if the path provided does not start from pageserver's workdir. 
-pub fn remote_path( - conf: &PageServerConf, - local_path: &Utf8Path, - generation: Generation, -) -> anyhow::Result { - let stripped = local_path - .strip_prefix(&conf.workdir) - .context("Failed to strip workdir prefix")?; - - let suffixed = format!("{0}{1}", stripped, generation.get_suffix()); - - RemotePath::new(Utf8Path::new(&suffixed)).with_context(|| { - format!( - "to resolve remote part of path {:?} for base {:?}", - local_path, conf.workdir - ) - }) -} - #[cfg(test)] mod tests { use super::*; @@ -1848,6 +1991,7 @@ mod tests { context::RequestContext, tenant::{ harness::{TenantHarness, TIMELINE_ID}, + storage_layer::layer::local_layer_path, Tenant, Timeline, }, DEFAULT_PG_VERSION, @@ -1876,8 +2020,8 @@ mod tests { TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap() } - fn assert_file_list(a: &HashSet, b: &[&str]) { - let mut avec: Vec = a.iter().map(|x| x.file_name()).collect(); + fn assert_file_list(a: &HashSet, b: &[&str]) { + let mut avec: Vec = a.iter().map(|x| x.to_string()).collect(); avec.sort(); let mut bvec = b.to_vec(); @@ -2003,7 +2147,7 @@ mod tests { .layer_metadata .keys() .map(|f| f.to_owned()) - .collect::>(); + .collect::>(); let initial_layer = { assert!(initial_layers.len() == 1); initial_layers.into_iter().next().unwrap() @@ -2029,12 +2173,21 @@ mod tests { ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz")) ] .into_iter() - .map(|(name, contents): (LayerFileName, Vec)| { - std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap(); + .map(|(name, contents): (LayerName, Vec)| { + + let local_path = local_layer_path( + harness.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &name, + &generation, + ); + std::fs::write(&local_path, &contents).unwrap(); Layer::for_resident( harness.conf, &timeline, + local_path, name, LayerFileMetadata::new(contents.len() as u64, generation, shard), ) @@ -2101,9 +2254,9 @@ mod tests { .map(|f| f.to_owned()) .collect(), &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), ], ); assert_eq!(index_part.metadata, metadata); @@ -2117,7 +2270,7 @@ mod tests { // keep using schedule_layer_file_deletion because we don't have a way to wait for the // spawn_blocking started by the drop. client - .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()]) + .schedule_layer_file_deletion(&[layers[0].layer_desc().layer_name()]) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); @@ -2135,9 +2288,9 @@ mod tests { } assert_remote_files( &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2150,9 +2303,9 @@ mod tests { assert_remote_files( &[ - &initial_layer.file_name(), - &layers[1].layer_desc().filename().file_name(), - &layers[2].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[1].layer_desc().layer_name().to_string(), + &layers[2].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2171,19 +2324,22 @@ mod tests { .. 
} = TestSetup::new("metrics").await.unwrap(); let client = timeline.remote_client.as_ref().unwrap(); - let timeline_path = harness.timeline_path(&TIMELINE_ID); - let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let local_path = local_layer_path( + harness.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &layer_file_name_1, + &harness.generation, + ); let content_1 = dummy_contents("foo"); - std::fs::write( - timeline_path.join(layer_file_name_1.file_name()), - &content_1, - ) - .unwrap(); + std::fs::write(&local_path, &content_1).unwrap(); let layer_file_1 = Layer::for_resident( harness.conf, &timeline, + local_path, layer_file_name_1.clone(), LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard), ); @@ -2252,12 +2408,7 @@ mod tests { async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart { // An empty IndexPart, just sufficient to ensure deserialization will succeed - let example_metadata = TimelineMetadata::example(); - let example_index_part = IndexPart::new( - HashMap::new(), - example_metadata.disk_consistent_lsn(), - example_metadata, - ); + let example_index_part = IndexPart::example(); let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index b038f264f550..b464437422e0 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -21,7 +21,8 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::layer::local_layer_path; +use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; @@ -47,7 +48,7 @@ pub async fn download_layer_file<'a>( storage: &'a GenericRemoteStorage, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - layer_file_name: &'a LayerFileName, + layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, cancel: &CancellationToken, ctx: &RequestContext, @@ -55,7 +56,13 @@ pub async fn download_layer_file<'a>( debug_assert_current_span_has_tenant_and_timeline_id(); let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); - let local_path = timeline_path.join(layer_file_name.file_name()); + let local_path = local_layer_path( + conf, + &tenant_shard_id, + &timeline_id, + layer_file_name, + &layer_metadata.generation, + ); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 0abfdeef023e..b114d6aa1047 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -6,10 +6,10 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; -use 
utils::bin_ser::SerializeError; +use utils::id::TimelineId; use crate::tenant::metadata::TimelineMetadata; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::tenant::upload_queue::UploadQueueInitialized; use crate::tenant::Generation; use pageserver_api::shard::ShardIndex; @@ -76,7 +76,7 @@ pub struct IndexPart { /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. - pub layer_metadata: HashMap, + pub layer_metadata: HashMap, // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. // It's duplicated for convenience when reading the serialized structure, but is @@ -85,6 +85,9 @@ pub struct IndexPart { #[serde(rename = "metadata_bytes")] pub metadata: TimelineMetadata, + + #[serde(default)] + pub(crate) lineage: Lineage, } impl IndexPart { @@ -97,22 +100,23 @@ impl IndexPart { /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers /// is always generated from the keys of `layer_metadata`) /// - 4: timeline_layers is fully removed. - const LATEST_VERSION: usize = 4; + /// - 5: lineage was added + const LATEST_VERSION: usize = 5; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5]; pub const FILE_NAME: &'static str = "index_part.json"; - pub fn new( - layers_and_metadata: HashMap, + fn new( + layers_and_metadata: &HashMap, disk_consistent_lsn: Lsn, metadata: TimelineMetadata, + lineage: Lineage, ) -> Self { - // Transform LayerFileMetadata into IndexLayerMetadata let layer_metadata = layers_and_metadata - .into_iter() - .map(|(k, v)| (k, IndexLayerMetadata::from(v))) + .iter() + .map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v))) .collect(); Self { @@ -121,6 +125,7 @@ impl IndexPart { disk_consistent_lsn, metadata, deleted_at: None, + lineage, } } @@ -141,20 +146,26 @@ impl IndexPart { pub fn to_s3_bytes(&self) -> serde_json::Result> { serde_json::to_vec(self) } -} -impl TryFrom<&UploadQueueInitialized> for IndexPart { - type Error = SerializeError; + #[cfg(test)] + pub(crate) fn example() -> Self { + let example_metadata = TimelineMetadata::example(); + Self::new( + &HashMap::new(), + example_metadata.disk_consistent_lsn(), + example_metadata, + Default::default(), + ) + } +} - fn try_from(upload_queue: &UploadQueueInitialized) -> Result { - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); - let metadata = upload_queue.latest_metadata.clone(); +impl From<&UploadQueueInitialized> for IndexPart { + fn from(uq: &UploadQueueInitialized) -> Self { + let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn(); + let metadata = uq.latest_metadata.clone(); + let lineage = uq.latest_lineage.clone(); - Ok(Self::new( - upload_queue.latest_files.clone(), - disk_consistent_lsn, - metadata, - )) + Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage) } } @@ -172,8 +183,8 @@ pub struct IndexLayerMetadata { pub shard: ShardIndex, } -impl From for IndexLayerMetadata { - fn from(other: LayerFileMetadata) -> Self { +impl From<&LayerFileMetadata> for IndexLayerMetadata { + fn from(other: &LayerFileMetadata) -> Self { IndexLayerMetadata { file_size: other.file_size, generation: other.generation, @@ -182,8 +193,76 @@ impl From for IndexLayerMetadata { } } +/// Limited history of earlier ancestors. 
+///
+/// A timeline can have more than one earlier ancestor, in the rare case that it was repeatedly
+/// reparented by having a later timeline be detached from its ancestor.
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
+pub(crate) struct Lineage {
+    /// Has the `reparenting_history` been truncated to [`Lineage::REMEMBER_AT_MOST`] entries.
+    #[serde(skip_serializing_if = "is_false", default)]
+    reparenting_history_truncated: bool,
+
+    /// Earlier ancestors, truncated when [`Self::reparenting_history_truncated`] is set.
+    ///
+    /// These are stored in case we want to support WAL-based DR on the timeline. There can be many
+    /// of these and at most one [`Self::original_ancestor`]. There cannot be more reparentings
+    /// after [`Self::original_ancestor`] has been set.
+    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    reparenting_history: Vec<TimelineId>,
+
+    /// The ancestor this timeline has been detached from, and when.
+    ///
+    /// If you are adding support for detaching from a hierarchy, consider changing the ancestry
+    /// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
+}
+
+fn is_false(b: &bool) -> bool {
+    !b
+}
+
+impl Lineage {
+    const REMEMBER_AT_MOST: usize = 100;
+
+    pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) {
+        if self.reparenting_history.last() == Some(old_ancestor) {
+            // do not re-record it
+            return;
+        }
+
+        let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
+
+        self.reparenting_history_truncated |= drop_oldest;
+        if drop_oldest {
+            self.reparenting_history.remove(0);
+        }
+        self.reparenting_history.push(*old_ancestor);
+    }
+
+    pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) {
+        assert!(self.original_ancestor.is_none());
+
+        self.original_ancestor =
+            Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
+    }
+
+    /// The queried lsn is most likely the basebackup lsn, and this answers the question "is it
+    /// allowed to start a read/write primary at this lsn".
+    ///
+    /// Returns true if the Lsn was previously a branch point.
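Since `lineage` is new in index_part v5, backward compatibility rests entirely on the `#[serde(default)]` / `skip_serializing_if` wiring above. A standalone illustration with simplified stand-in types (not the real `IndexPart`/`Lineage`, which carry more fields; serde and serde_json assumed as dependencies):

use serde::{Deserialize, Serialize};

// Simplified stand-ins: every lineage field is skipped when empty/None and defaulted on
// read, so index_part.json files written before v5 still deserialize cleanly.
#[derive(Debug, Default, PartialEq, Serialize, Deserialize)]
struct IndexPartStub {
    version: usize,
    #[serde(default)]
    lineage: LineageStub,
}

#[derive(Debug, Default, PartialEq, Serialize, Deserialize)]
struct LineageStub {
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    reparenting_history: Vec<String>,
    #[serde(skip_serializing_if = "Option::is_none", default)]
    original_ancestor: Option<(String, String, String)>,
}

fn main() -> serde_json::Result<()> {
    // A pre-v5 document without a "lineage" key still parses.
    let old: IndexPartStub = serde_json::from_str(r#"{"version":4}"#)?;
    assert_eq!(old.lineage, LineageStub::default());

    // A default lineage serializes as an empty object because every field is skipped.
    let new = IndexPartStub { version: 5, ..Default::default() };
    assert_eq!(serde_json::to_string(&new)?, r#"{"version":5,"lineage":{}}"#);
    Ok(())
}

The real struct handles `reparenting_history_truncated` the same way through the `is_false` helper shown above.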
+ pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { + self.original_ancestor + .as_ref() + .is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn) + } +} + #[cfg(test)] mod tests { + use std::str::FromStr; + use super::*; #[test] @@ -219,6 +298,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -259,6 +339,7 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -300,7 +381,8 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: 
TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()) + "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + lineage: Lineage::default(), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -345,6 +427,7 @@ mod tests { ]) .unwrap(), deleted_at: None, + lineage: Lineage::default(), }; let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -383,11 +466,58 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), - deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + lineage: Lineage::default(), + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v5_indexpart_is_parsed() { + let example = r#"{ + "version":5, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499":{"file_size":23289856,"generation":1}, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619":{"file_size":1015808,"generation":1}}, + 
"disk_consistent_lsn":"0/15A7618", + "metadata_bytes":[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "lineage":{ + "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], + "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] + } + }"#; + + let expected = IndexPart { + version: 5, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata { + file_size: 23289856, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata { + file_size: 1015808, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }) + ]), + disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(), + metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), + deleted_at: None, + lineage: Lineage { + reparenting_history_truncated: false, + reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], + original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), + }, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } + + fn parse_naive_datetime(s: &str) -> 
NaiveDateTime { + chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() + } } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 0227331953a6..caa843316f11 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -12,18 +12,13 @@ use tokio_util::sync::CancellationToken; use utils::backoff; use super::Generation; -use crate::{ - config::PageServerConf, - tenant::remote_timeline_client::{ - index::IndexPart, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, remote_path, - }, +use crate::tenant::remote_timeline_client::{ + index::IndexPart, remote_index_path, remote_initdb_archive_path, + remote_initdb_preserved_archive_path, }; -use remote_storage::{GenericRemoteStorage, TimeTravelError}; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use utils::id::{TenantId, TimelineId}; -use super::index::LayerFileMetadata; - use tracing::info; /// Serializes and uploads the given index part data to the remote storage. @@ -65,11 +60,10 @@ pub(crate) async fn upload_index_part<'a>( /// /// On an error, bumps the retries count and reschedules the entire task. pub(super) async fn upload_timeline_layer<'a>( - conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, - source_path: &'a Utf8Path, - known_metadata: &'a LayerFileMetadata, - generation: Generation, + local_path: &'a Utf8Path, + remote_path: &'a RemotePath, + metadata_size: u64, cancel: &CancellationToken, ) -> anyhow::Result<()> { fail_point!("before-upload-layer", |_| { @@ -78,8 +72,7 @@ pub(super) async fn upload_timeline_layer<'a>( pausable_failpoint!("before-upload-layer-pausable"); - let storage_path = remote_path(conf, source_path, generation)?; - let source_file_res = fs::File::open(&source_path).await; + let source_file_res = fs::File::open(&local_path).await; let source_file = match source_file_res { Ok(source_file) => source_file, Err(e) if e.kind() == ErrorKind::NotFound => { @@ -90,34 +83,49 @@ pub(super) async fn upload_timeline_layer<'a>( // it has been written to disk yet. // // This is tested against `test_compaction_delete_before_upload` - info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."); + info!(path = %local_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."); return Ok(()); } - Err(e) => { - Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))? - } + Err(e) => Err(e).with_context(|| format!("open a source file for layer {local_path:?}"))?, }; let fs_size = source_file .metadata() .await - .with_context(|| format!("get the source file metadata for layer {source_path:?}"))? + .with_context(|| format!("get the source file metadata for layer {local_path:?}"))? 
.len(); - let metadata_size = known_metadata.file_size(); if metadata_size != fs_size { - bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); + bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); } let fs_size = usize::try_from(fs_size) - .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?; + .with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?; let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); storage - .upload(reader, fs_size, &storage_path, None, cancel) + .upload(reader, fs_size, remote_path, None, cancel) + .await + .with_context(|| format!("upload layer from local path '{local_path}'")) +} + +pub(super) async fn copy_timeline_layer( + storage: &GenericRemoteStorage, + source_path: &RemotePath, + target_path: &RemotePath, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + fail_point!("before-copy-layer", |_| { + bail!("failpoint before-copy-layer") + }); + + pausable_failpoint!("before-copy-layer-pausable"); + + storage + .copy_object(source_path, target_path, cancel) .await - .with_context(|| format!("upload layer from local path '{source_path}'")) + .with_context(|| format!("copy layer {source_path} to {target_path}")) } /// Uploads the given `initdb` data to the remote storage. diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 5c46df268a9c..7075044baf90 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -21,8 +21,9 @@ use self::{ use super::{ config::{SecondaryLocationConfig, TenantConfOpt}, mgr::TenantManager, + remote_timeline_client::LayerFileMetadata, span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerFileName, + storage_layer::{layer::local_layer_path, LayerName}, }; use pageserver_api::{ @@ -181,7 +182,8 @@ impl SecondaryTenant { self: &Arc, conf: &PageServerConf, timeline_id: TimelineId, - name: LayerFileName, + name: LayerName, + metadata: LayerFileMetadata, ) { debug_assert_current_span_has_tenant_id(); @@ -195,9 +197,13 @@ impl SecondaryTenant { let now = SystemTime::now(); - let path = conf - .timeline_path(&self.tenant_shard_id, &timeline_id) - .join(name.file_name()); + let local_path = local_layer_path( + conf, + &self.tenant_shard_id, + &timeline_id, + &name, + &metadata.generation, + ); let this = self.clone(); @@ -208,7 +214,7 @@ impl SecondaryTenant { // it, the secondary downloader could have seen an updated heatmap that // resulted in a layer being deleted. // Other local I/O errors are process-fatal: these should never happen. 
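The eviction comment above describes a specific error-handling contract: a missing file is expected (a concurrent download/eviction may have raced us), while any other local I/O error is treated as fatal. Below is a minimal, self-contained sketch of that pattern; the function name and `bool` return are illustrative only and not the pageserver's actual helper.

```rust
use std::io::ErrorKind;
use std::path::Path;

/// Illustrative sketch: remove a local layer file, treating "already gone" as success.
/// Any other I/O error is surfaced to the caller, which may decide it is process-fatal.
fn remove_layer_file(local_path: &Path) -> std::io::Result<bool> {
    match std::fs::remove_file(local_path) {
        // The file was present and has now been deleted.
        Ok(()) => Ok(true),
        // Someone else (e.g. a concurrent heatmap-driven eviction) already removed it: not an error.
        Err(e) if e.kind() == ErrorKind::NotFound => Ok(false),
        // Unexpected local I/O failure: propagate.
        Err(e) => Err(e),
    }
}

fn main() -> std::io::Result<()> {
    // Removing a path that does not exist reports `false` rather than failing.
    let deleted = remove_layer_file(Path::new("/tmp/nonexistent-layer-file"))?;
    println!("deleted: {deleted}");
    Ok(())
}
```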
- let deleted = std::fs::remove_file(path); + let deleted = std::fs::remove_file(local_path); let not_found = deleted .as_ref() diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index fb8907b5a83f..2a8f83be9571 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -22,7 +22,7 @@ use crate::{ FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerFileName, + storage_layer::{layer::local_layer_path, LayerName}, tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, @@ -111,7 +111,7 @@ impl OnDiskState { _conf: &'static PageServerConf, _tenant_shard_id: &TenantShardId, _imeline_id: &TimelineId, - _ame: LayerFileName, + _ame: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, ) -> Self { @@ -124,10 +124,10 @@ impl OnDiskState { #[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { - pub(super) on_disk_layers: HashMap, + pub(super) on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. - pub(super) evicted_at: HashMap, + pub(super) evicted_at: HashMap, } /// This state is written by the secondary downloader, it is opaque @@ -621,12 +621,12 @@ impl<'a> TenantDownloader<'a> { let layers_in_heatmap = heatmap_timeline .layers .iter() - .map(|l| &l.name) + .map(|l| (&l.name, l.metadata.generation)) .collect::>(); let layers_on_disk = timeline_state .on_disk_layers .iter() - .map(|l| l.0) + .map(|l| (l.0, l.1.metadata.generation)) .collect::>(); let mut layer_count = layers_on_disk.len(); @@ -637,16 +637,24 @@ impl<'a> TenantDownloader<'a> { .sum(); // Remove on-disk layers that are no longer present in heatmap - for layer in layers_on_disk.difference(&layers_in_heatmap) { + for (layer_file_name, generation) in layers_on_disk.difference(&layers_in_heatmap) { layer_count -= 1; layer_byte_count -= timeline_state .on_disk_layers - .get(layer) + .get(layer_file_name) .unwrap() .metadata .file_size(); - delete_layers.push((*timeline_id, (*layer).clone())); + let local_path = local_layer_path( + self.conf, + self.secondary_state.get_tenant_shard_id(), + timeline_id, + layer_file_name, + generation, + ); + + delete_layers.push((*timeline_id, (*layer_file_name).clone(), local_path)); } progress.bytes_downloaded += layer_byte_count; @@ -661,11 +669,7 @@ impl<'a> TenantDownloader<'a> { } // Execute accumulated deletions - for (timeline_id, layer_name) in delete_layers { - let timeline_path = self - .conf - .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id); - let local_path = timeline_path.join(layer_name.to_string()); + for (timeline_id, layer_name, local_path) in delete_layers { tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",); tokio::fs::remove_file(&local_path) @@ -754,9 +758,6 @@ impl<'a> TenantDownloader<'a> { ) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_and_timeline_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - let timeline_path = self - .conf - .timeline_path(tenant_shard_id, &timeline.timeline_id); // Accumulate updates to the state let mut touched = Vec::new(); @@ -806,10 +807,14 @@ impl<'a> TenantDownloader<'a> { if cfg!(debug_assertions) { // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think // are already present on 
disk are really there. - let local_path = self - .conf - .timeline_path(tenant_shard_id, &timeline.timeline_id) - .join(layer.name.file_name()); + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + &layer.name, + &layer.metadata.generation, + ); + match tokio::fs::metadata(&local_path).await { Ok(meta) => { tracing::debug!( @@ -903,7 +908,13 @@ impl<'a> TenantDownloader<'a> { }; if downloaded_bytes != layer.metadata.file_size { - let local_path = timeline_path.join(layer.name.to_string()); + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + &layer.name, + &layer.metadata.generation, + ); tracing::warn!( "Downloaded layer {} with unexpected size {} != {}. Removing download.", @@ -986,7 +997,7 @@ async fn init_timeline_state( // As we iterate through layers found on disk, we will look up their metadata from this map. // Layers not present in metadata will be discarded. - let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> = + let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = heatmap.layers.iter().map(|l| (&l.name, l)).collect(); while let Some(dentry) = dir @@ -1023,7 +1034,7 @@ async fn init_timeline_state( continue; } - match LayerFileName::from_str(file_name) { + match LayerName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); match remote_meta { diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 73cdf6c6d40f..ca91ec24c650 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,8 +1,6 @@ use std::time::SystemTime; -use crate::tenant::{ - remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName, -}; +use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; @@ -31,7 +29,7 @@ pub(crate) struct HeatMapTimeline { #[serde_as] #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapLayer { - pub(super) name: LayerFileName, + pub(super) name: LayerName, pub(super) metadata: IndexLayerMetadata, #[serde_as(as = "TimestampSeconds")] @@ -42,7 +40,7 @@ pub(crate) struct HeatMapLayer { impl HeatMapLayer { pub(crate) fn new( - name: LayerFileName, + name: LayerName, metadata: IndexLayerMetadata, access_time: SystemTime, ) -> Self { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 4f1b56ef9f77..94a5e9ec47af 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -1,11 +1,11 @@ //! 
Common traits and structs for layers pub mod delta_layer; -mod filename; pub mod image_layer; pub(crate) mod inmemory_layer; pub(crate) mod layer; mod layer_desc; +mod layer_name; use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; @@ -34,10 +34,10 @@ use utils::rate_limit::RateLimit; use utils::{id::TimelineId, lsn::Lsn}; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; -pub use filename::{DeltaFileName, ImageFileName, LayerFileName}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; +pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; @@ -646,8 +646,8 @@ pub mod tests { use super::*; - impl From for PersistentLayerDesc { - fn from(value: DeltaFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: DeltaLayerName) -> Self { PersistentLayerDesc::new_delta( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -658,8 +658,8 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: ImageFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: ImageLayerName) -> Self { PersistentLayerDesc::new_img( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -670,11 +670,11 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: LayerFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: LayerName) -> Self { match value { - LayerFileName::Delta(d) => Self::from(d), - LayerFileName::Image(i) => Self::from(i), + LayerName::Delta(d) => Self::from(d), + LayerName::Image(i) => Self::from(i), } } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index b5538dff3af5..c38c9bb6564c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -57,6 +57,7 @@ use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::fs::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; use tracing::*; @@ -68,7 +69,8 @@ use utils::{ }; use super::{ - AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, + AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, }; /// @@ -309,13 +311,13 @@ impl DeltaLayer { .and_then(|res| res)?; // not production code - let actual_filename = path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(Arc::new(loaded)) @@ -1139,15 +1141,15 @@ impl DeltaLayerInner { Ok(all_keys) } - /// Using the given writer, write out a truncated version, where LSNs higher than the - /// truncate_at are missing. - #[cfg(test)] + /// Using the given writer, write out a version which has the earlier Lsns than `until`. 
+ /// + /// Return the amount of key value records pushed to the writer. pub(super) async fn copy_prefix( &self, writer: &mut DeltaLayerWriter, - truncate_at: Lsn, + until: Lsn, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { use crate::tenant::vectored_blob_io::{ BlobMeta, VectoredReadBuilder, VectoredReadExtended, }; @@ -1211,6 +1213,8 @@ impl DeltaLayerInner { // FIXME: buffering of DeltaLayerWriter let mut per_blob_copy = Vec::new(); + let mut records = 0; + while let Some(item) = stream.try_next().await? { tracing::debug!(?item, "popped"); let offset = item @@ -1229,7 +1233,7 @@ impl DeltaLayerInner { prev = Option::from(item); - let actionable = actionable.filter(|x| x.0.lsn < truncate_at); + let actionable = actionable.filter(|x| x.0.lsn < until); let builder = if let Some((meta, offsets)) = actionable { // extend or create a new builder @@ -1297,7 +1301,7 @@ impl DeltaLayerInner { let will_init = crate::repository::ValueBytes::will_init(data) .inspect_err(|_e| { #[cfg(feature = "testing")] - tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value"); + tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); }) .unwrap_or(false); @@ -1314,7 +1318,10 @@ impl DeltaLayerInner { ) .await; per_blob_copy = tmp; + res?; + + records += 1; } buffer = Some(res.buf); @@ -1326,7 +1333,7 @@ impl DeltaLayerInner { "with the sentinel above loop should had handled all" ); - Ok(()) + Ok(records) } pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { @@ -1399,7 +1406,6 @@ impl DeltaLayerInner { Ok(()) } - #[cfg(test)] fn stream_index_forwards<'a, R>( &'a self, reader: &'a DiskBtreeReader, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 1477a1fc33cf..c9874873e4ba 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -54,6 +54,7 @@ use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; use tokio_stream::StreamExt; @@ -65,8 +66,10 @@ use utils::{ lsn::Lsn, }; -use super::filename::ImageFileName; -use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState}; +use super::layer_name::ImageLayerName; +use super::{ + AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -231,7 +234,7 @@ impl ImageLayer { conf: &PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - fname: &ImageFileName, + fname: &ImageLayerName, ) -> Utf8PathBuf { let rand_string: String = rand::thread_rng() .sample_iter(&Alphanumeric) @@ -267,13 +270,13 @@ impl ImageLayer { .and_then(|res| res)?; // not production code - let actual_filename = path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", 
actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(loaded) @@ -635,7 +638,7 @@ impl ImageLayerWriterInner { conf, timeline_id, tenant_shard_id, - &ImageFileName { + &ImageLayerName { key_range: key_range.clone(), lsn, }, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index ebc0cbf9a4bb..b5b0260327c4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -4,12 +4,13 @@ use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{ HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, }; -use pageserver_api::shard::ShardIndex; +use pageserver_api::shard::{ShardIndex, TenantShardId}; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime}; use tracing::Instrument; +use utils::id::TimelineId; use utils::lsn::Lsn; use utils::sync::heavier_once_cell; @@ -24,7 +25,7 @@ use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; use super::image_layer; use super::{ - AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc, + AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }; @@ -123,14 +124,42 @@ impl PartialEq for Layer { } } +pub(crate) fn local_layer_path( + conf: &PageServerConf, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + layer_file_name: &LayerName, + _generation: &Generation, +) -> Utf8PathBuf { + let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id); + + timeline_path.join(layer_file_name.to_string()) + + // TODO: switch to enabling new-style layer paths after next release + // if generation.is_none() { + // // Without a generation, we may only use legacy path style + // timeline_path.join(layer_file_name.to_string()) + // } else { + // timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix())) + // } +} + impl Layer { /// Creates a layer value for a file we know to not be resident. 
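As a reading aid for the `local_layer_path` helper and its commented-out TODO above, here is a standalone sketch of the path scheme it describes, under the assumption that the future layout appends `-v1` plus a generation suffix. It uses `std::path` instead of the crate's `Utf8PathBuf`, and `generation_suffix` stands in for `Generation::get_suffix()`; the function name is hypothetical.

```rust
use std::path::{Path, PathBuf};

/// Illustrative sketch: today the local path is just `<timeline dir>/<layer name>`;
/// the TODO above would switch to `<timeline dir>/<layer name>-v1<generation suffix>`
/// once generation-suffixed local paths are enabled.
fn sketch_local_layer_path(
    timeline_dir: &Path,
    layer_name: &str,
    generation_suffix: Option<&str>, // e.g. Some("-00000001"); None models a legacy/no-generation layer
) -> PathBuf {
    match generation_suffix {
        None => timeline_dir.join(layer_name),
        Some(suffix) => timeline_dir.join(format!("{layer_name}-v1{suffix}")),
    }
}

fn main() {
    let dir = Path::new("/data/tenants/t/timelines/tl");
    let name = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499";
    println!("{}", sketch_local_layer_path(dir, name, None).display());
    println!("{}", sketch_local_layer_path(dir, name, Some("-00000001")).display());
}
```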
pub(crate) fn for_evicted( conf: &'static PageServerConf, timeline: &Arc, - file_name: LayerFileName, + file_name: LayerName, metadata: LayerFileMetadata, ) -> Self { + let local_path = local_layer_path( + conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &file_name, + &metadata.generation, + ); + let desc = PersistentLayerDesc::from_filename( timeline.tenant_shard_id, timeline.timeline_id, @@ -143,6 +172,7 @@ impl Layer { let owner = Layer(Arc::new(LayerInner::new( conf, timeline, + local_path, access_stats, desc, None, @@ -159,7 +189,8 @@ impl Layer { pub(crate) fn for_resident( conf: &'static PageServerConf, timeline: &Arc, - file_name: LayerFileName, + local_path: Utf8PathBuf, + file_name: LayerName, metadata: LayerFileMetadata, ) -> ResidentLayer { let desc = PersistentLayerDesc::from_filename( @@ -184,6 +215,7 @@ impl Layer { LayerInner::new( conf, timeline, + local_path, access_stats, desc, Some(inner), @@ -225,9 +257,19 @@ impl Layer { LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); + + let local_path = local_layer_path( + conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &desc.layer_name(), + &timeline.generation, + ); + LayerInner::new( conf, timeline, + local_path, access_stats, desc, Some(inner), @@ -410,6 +452,13 @@ impl Layer { self.0.metadata() } + pub(crate) fn get_timeline_id(&self) -> Option { + self.0 + .timeline + .upgrade() + .map(|timeline| timeline.timeline_id) + } + /// Traditional debug dumping facility #[allow(unused)] pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> { @@ -641,7 +690,7 @@ impl Drop for LayerInner { let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id); let path = std::mem::take(&mut self.path); - let file_name = self.layer_desc().filename(); + let file_name = self.layer_desc().layer_name(); let file_size = self.layer_desc().file_size; let timeline = self.timeline.clone(); let meta = self.metadata(); @@ -709,19 +758,17 @@ impl Drop for LayerInner { } impl LayerInner { + #[allow(clippy::too_many_arguments)] fn new( conf: &'static PageServerConf, timeline: &Arc, + local_path: Utf8PathBuf, access_stats: LayerAccessStats, desc: PersistentLayerDesc, downloaded: Option>, generation: Generation, shard: ShardIndex, ) -> Self { - let path = conf - .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id) - .join(desc.filename().to_string()); - let (inner, version, init_status) = if let Some(inner) = downloaded { let version = inner.version; let resident = ResidentOrWantedEvicted::Resident(inner); @@ -736,8 +783,10 @@ impl LayerInner { LayerInner { conf, - debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() }, - path, + debug_str: { + format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into() + }, + path: local_path, desc, timeline: Arc::downgrade(timeline), have_remote_client: timeline.remote_client.is_some(), @@ -1074,7 +1123,7 @@ impl LayerInner { let result = client .download_layer_file( - &self.desc.filename(), + &self.desc.layer_name(), &self.metadata(), &timeline.cancel, ctx, @@ -1211,7 +1260,7 @@ impl LayerInner { } fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { - let layer_file_name = self.desc.filename().file_name(); + let layer_name = self.desc.layer_name().to_string(); let resident = self .inner @@ -1225,7 
+1274,7 @@ impl LayerInner { let lsn_range = &self.desc.lsn_range; HistoricLayerInfo::Delta { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn_range.start, lsn_end: lsn_range.end, @@ -1236,7 +1285,7 @@ impl LayerInner { let lsn = self.desc.image_layer_lsn(); HistoricLayerInfo::Image { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn, remote: !resident, @@ -1797,25 +1846,23 @@ impl ResidentLayer { } } - /// FIXME: truncate is bad name because we are not truncating anything, but copying the - /// filtered parts. - #[cfg(test)] - pub(super) async fn copy_delta_prefix( + /// Returns the amount of keys and values written to the writer. + pub(crate) async fn copy_delta_prefix( &self, writer: &mut super::delta_layer::DeltaLayerWriter, - truncate_at: Lsn, + until: Lsn, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { use LayerKind::*; let owner = &self.owner.0; match self.downloaded.get(owner, ctx).await? { Delta(ref d) => d - .copy_prefix(writer, truncate_at, ctx) + .copy_prefix(writer, until, ctx) .await - .with_context(|| format!("truncate {self}")), - Image(_) => anyhow::bail!(format!("cannot truncate image layer {self}")), + .with_context(|| format!("copy_delta_prefix until {until} of {self}")), + Image(_) => anyhow::bail!(format!("cannot copy_lsn_prefix of image layer {self}")), } } diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index c375923e8191..a89b66e4a1b7 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -5,7 +5,7 @@ use utils::{id::TimelineId, lsn::Lsn}; use crate::repository::Key; -use super::{DeltaFileName, ImageFileName, LayerFileName}; +use super::{DeltaLayerName, ImageLayerName, LayerName}; use serde::{Deserialize, Serialize}; @@ -51,7 +51,7 @@ impl PersistentLayerDesc { } pub fn short_id(&self) -> impl Display { - self.filename() + self.layer_name() } #[cfg(test)] @@ -103,14 +103,14 @@ impl PersistentLayerDesc { pub fn from_filename( tenant_shard_id: TenantShardId, timeline_id: TimelineId, - filename: LayerFileName, + filename: LayerName, file_size: u64, ) -> Self { match filename { - LayerFileName::Image(i) => { + LayerName::Image(i) => { Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size) } - LayerFileName::Delta(d) => Self::new_delta( + LayerName::Delta(d) => Self::new_delta( tenant_shard_id, timeline_id, d.key_range, @@ -132,34 +132,34 @@ impl PersistentLayerDesc { lsn..(lsn + 1) } - /// Get a delta file name for this layer. + /// Get a delta layer name for this layer. /// /// Panic: if this is not a delta layer. - pub fn delta_file_name(&self) -> DeltaFileName { + pub fn delta_layer_name(&self) -> DeltaLayerName { assert!(self.is_delta); - DeltaFileName { + DeltaLayerName { key_range: self.key_range.clone(), lsn_range: self.lsn_range.clone(), } } - /// Get a delta file name for this layer. + /// Get a image layer name for this layer. 
/// /// Panic: if this is not an image layer, or the lsn range is invalid - pub fn image_file_name(&self) -> ImageFileName { + pub fn image_layer_name(&self) -> ImageLayerName { assert!(!self.is_delta); assert!(self.lsn_range.start + 1 == self.lsn_range.end); - ImageFileName { + ImageLayerName { key_range: self.key_range.clone(), lsn: self.lsn_range.start, } } - pub fn filename(&self) -> LayerFileName { + pub fn layer_name(&self) -> LayerName { if self.is_delta { - self.delta_file_name().into() + self.delta_layer_name().into() } else { - self.image_file_name().into() + self.image_layer_name().into() } } diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/layer_name.rs similarity index 57% rename from pageserver/src/tenant/storage_layer/filename.rs rename to pageserver/src/tenant/storage_layer/layer_name.rs index a98be0842bff..c733404693e1 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -2,40 +2,42 @@ //! Helper functions for dealing with filenames of the image and delta layer files. //! use crate::repository::Key; +use std::borrow::Cow; use std::cmp::Ordering; use std::fmt; use std::ops::Range; use std::str::FromStr; +use regex::Regex; use utils::lsn::Lsn; use super::PersistentLayerDesc; // Note: Timeline::load_layer_map() relies on this sort order #[derive(PartialEq, Eq, Clone, Hash)] -pub struct DeltaFileName { +pub struct DeltaLayerName { pub key_range: Range, pub lsn_range: Range, } -impl std::fmt::Debug for DeltaFileName { +impl std::fmt::Debug for DeltaLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use super::RangeDisplayDebug; - f.debug_struct("DeltaFileName") + f.debug_struct("DeltaLayerName") .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn_range", &self.lsn_range) .finish() } } -impl PartialOrd for DeltaFileName { +impl PartialOrd for DeltaLayerName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for DeltaFileName { +impl Ord for DeltaLayerName { fn cmp(&self, other: &Self) -> Ordering { let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { @@ -55,16 +57,14 @@ impl Ord for DeltaFileName { } } -/// Represents the filename of a DeltaLayer +/// Represents the region of the LSN-Key space covered by a DeltaLayer /// /// ```text /// -__- /// ``` -impl DeltaFileName { - /// - /// Parse a string as a delta file name. Returns None if the filename does not - /// match the expected pattern. - /// +impl DeltaLayerName { + /// Parse the part of a delta layer's file name that represents the LayerName. Returns None + /// if the filename does not match the expected pattern. pub fn parse_str(fname: &str) -> Option { let mut parts = fname.split("__"); let mut key_parts = parts.next()?.split('-'); @@ -74,10 +74,19 @@ impl DeltaFileName { let key_end_str = key_parts.next()?; let lsn_start_str = lsn_parts.next()?; let lsn_end_str = lsn_parts.next()?; + if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() { return None; } + if key_start_str.len() != 36 + || key_end_str.len() != 36 + || lsn_start_str.len() != 16 + || lsn_end_str.len() != 16 + { + return None; + } + let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; @@ -94,14 +103,14 @@ impl DeltaFileName { // or panic? 
} - Some(DeltaFileName { + Some(DeltaLayerName { key_range: key_start..key_end, lsn_range: start_lsn..end_lsn, }) } } -impl fmt::Display for DeltaFileName { +impl fmt::Display for DeltaLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -115,29 +124,29 @@ impl fmt::Display for DeltaFileName { } #[derive(PartialEq, Eq, Clone, Hash)] -pub struct ImageFileName { +pub struct ImageLayerName { pub key_range: Range, pub lsn: Lsn, } -impl std::fmt::Debug for ImageFileName { +impl std::fmt::Debug for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use super::RangeDisplayDebug; - f.debug_struct("ImageFileName") + f.debug_struct("ImageLayerName") .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn", &self.lsn) .finish() } } -impl PartialOrd for ImageFileName { +impl PartialOrd for ImageLayerName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for ImageFileName { +impl Ord for ImageLayerName { fn cmp(&self, other: &Self) -> Ordering { let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { @@ -153,7 +162,7 @@ impl Ord for ImageFileName { } } -impl ImageFileName { +impl ImageLayerName { pub fn lsn_as_range(&self) -> Range { // Saves from having to copypaste this all over PersistentLayerDesc::image_layer_lsn_range(self.lsn) @@ -161,16 +170,14 @@ impl ImageFileName { } /// -/// Represents the filename of an ImageLayer +/// Represents the part of the Key-LSN space covered by an ImageLayer /// /// ```text /// -__ /// ``` -impl ImageFileName { - /// - /// Parse a string as an image file name. Returns None if the filename does not - /// match the expected pattern. - /// +impl ImageLayerName { + /// Parse a string as then LayerName part of an image layer file name. Returns None if the + /// filename does not match the expected pattern. pub fn parse_str(fname: &str) -> Option { let mut parts = fname.split("__"); let mut key_parts = parts.next()?.split('-'); @@ -182,19 +189,23 @@ impl ImageFileName { return None; } + if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 { + return None; + } + let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; let lsn = Lsn::from_hex(lsn_str).ok()?; - Some(ImageFileName { + Some(ImageLayerName { key_range: key_start..key_end, lsn, }) } } -impl fmt::Display for ImageFileName { +impl fmt::Display for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -205,21 +216,24 @@ impl fmt::Display for ImageFileName { ) } } + +/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. The +/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations +/// over time (e.g. across shard splits or compression). 
The physical filenames of layers in local +/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers +/// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path]) +/// and [`crate::tenant::storage_layer::layer::local_layer_path`]) #[derive(Debug, PartialEq, Eq, Hash, Clone)] -pub enum LayerFileName { - Image(ImageFileName), - Delta(DeltaFileName), +pub enum LayerName { + Image(ImageLayerName), + Delta(DeltaLayerName), } -impl LayerFileName { - pub fn file_name(&self) -> String { - self.to_string() - } - +impl LayerName { /// Determines if this layer file is considered to be in future meaning we will discard these /// layers during timeline initialization from the given disk_consistent_lsn. pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool { - use LayerFileName::*; + use LayerName::*; match self { Image(file_name) if file_name.lsn > disk_consistent_lsn => true, Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true, @@ -228,7 +242,7 @@ impl LayerFileName { } pub(crate) fn kind(&self) -> &'static str { - use LayerFileName::*; + use LayerName::*; match self { Delta(_) => "delta", Image(_) => "image", @@ -236,7 +250,7 @@ impl LayerFileName { } } -impl fmt::Display for LayerFileName { +impl fmt::Display for LayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Image(fname) => write!(f, "{fname}"), @@ -245,23 +259,36 @@ impl fmt::Display for LayerFileName { } } -impl From for LayerFileName { - fn from(fname: ImageFileName) -> Self { +impl From for LayerName { + fn from(fname: ImageLayerName) -> Self { Self::Image(fname) } } -impl From for LayerFileName { - fn from(fname: DeltaFileName) -> Self { +impl From for LayerName { + fn from(fname: DeltaLayerName) -> Self { Self::Delta(fname) } } -impl FromStr for LayerFileName { +impl FromStr for LayerName { type Err = String; + /// Conversion from either a physical layer filename, or the string-ization of + /// Self. When loading a physical layer filename, we drop any extra information + /// not needed to build Self. 
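The `FromStr` impl below accepts both the logical `LayerName` string and a physical file name that carries a generation qualifier. As a companion to that, here is a simplified, self-contained sketch of just the suffix-stripping step, assuming the same `-v1-<8 hex digits>` qualifier format shown in the tests; the helper name is made up and the sketch depends only on the `regex` crate.

```rust
use regex::Regex;

/// Illustrative sketch of the first step of parsing: if the input is a physical file name with
/// a `-v1-<generation>` qualifier, strip it and keep only the logical layer name; otherwise
/// return the input unchanged.
fn strip_generation_suffix(file_name: &str) -> &str {
    // Assumption: the qualifier matches the `-v1-[0-9a-f]{8}` pattern used in the diff.
    let re = Regex::new(r"^(?P<base>.+)-v1-[0-9a-f]{8}$").unwrap();
    re.captures(file_name)
        .and_then(|caps| caps.name("base"))
        .map(|m| m.as_str())
        .unwrap_or(file_name)
}

fn main() {
    let physical = "000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001";
    let logical = "000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58";
    assert_eq!(strip_generation_suffix(physical), logical);
    assert_eq!(strip_generation_suffix(logical), logical);
    println!("ok");
}
```

Compiling the regex on every call mirrors how `from_str` is written here; if layer-name parsing ever became hot, hoisting the compiled regex into a lazily initialized static would be a natural refinement.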
fn from_str(value: &str) -> Result { - let delta = DeltaFileName::parse_str(value); - let image = ImageFileName::parse_str(value); + let gen_suffix_regex = Regex::new("^(?.+)(?-v1-[0-9a-f]{8})$").unwrap(); + let file_name: Cow = match gen_suffix_regex.captures(value) { + Some(captures) => captures + .name("base") + .expect("Non-optional group") + .as_str() + .into(), + None => value.into(), + }; + + let delta = DeltaLayerName::parse_str(&file_name); + let image = ImageLayerName::parse_str(&file_name); let ok = match (delta, image) { (None, None) => { return Err(format!( @@ -276,7 +303,7 @@ impl FromStr for LayerFileName { } } -impl serde::Serialize for LayerFileName { +impl serde::Serialize for LayerName { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, @@ -288,19 +315,19 @@ impl serde::Serialize for LayerFileName { } } -impl<'de> serde::Deserialize<'de> for LayerFileName { +impl<'de> serde::Deserialize<'de> for LayerName { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { - deserializer.deserialize_string(LayerFileNameVisitor) + deserializer.deserialize_string(LayerNameVisitor) } } -struct LayerFileNameVisitor; +struct LayerNameVisitor; -impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { - type Value = LayerFileName; +impl<'de> serde::de::Visitor<'de> for LayerNameVisitor { + type Value = LayerName; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { write!( @@ -315,3 +342,42 @@ impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { v.parse().map_err(|e| E::custom(e)) } } + +#[cfg(test)] +mod test { + use super::*; + #[test] + fn image_layer_parse() -> anyhow::Result<()> { + let expected = LayerName::Image(ImageLayerName { + key_range: Key::from_i128(0) + ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), + lsn: Lsn::from_hex("00000000014FED58").unwrap(), + }); + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected,); + + // Omitting generation suffix is valid + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected,); + + Ok(()) + } + + #[test] + fn delta_layer_parse() -> anyhow::Result<()> { + let expected = LayerName::Delta(DeltaLayerName { + key_range: Key::from_i128(0) + ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), + lsn_range: Lsn::from_hex("00000000014FED58").unwrap() + ..Lsn::from_hex("000000000154C481").unwrap(), + }); + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected); + + // Omitting generation suffix is valid + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected); + + Ok(()) + } +} diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 41b77c1f4a40..f153719f9825 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -2,6 +2,7 @@ //! 
such as compaction and GC use std::ops::ControlFlow; +use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -9,9 +10,11 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; +use rand::Rng; use tokio_util::sync::CancellationToken; use tracing::*; use utils::{backoff, completion}; @@ -44,6 +47,7 @@ pub(crate) enum BackgroundLoopKind { Compaction, Gc, Eviction, + IngestHouseKeeping, ConsumptionMetricsCollectMetrics, ConsumptionMetricsSyntheticSizeWorker, InitialLogicalSizeCalculation, @@ -132,6 +136,30 @@ pub fn start_background_loops( } }, ); + + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::IngestHousekeeping, + Some(tenant_shard_id), + None, + &format!("ingest housekeeping for tenant {tenant_shard_id}"), + false, + { + let tenant = Arc::clone(tenant); + let background_jobs_can_start = background_jobs_can_start.cloned(); + async move { + let cancel = task_mgr::shutdown_token(); + tokio::select! { + _ = cancel.cancelled() => { return Ok(()) }, + _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + }; + ingest_housekeeping_loop(tenant, cancel) + .instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) + .await; + Ok(()) + } + }, + ); } /// @@ -379,6 +407,61 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); } +async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + async { + loop { + tokio::select! { + _ = cancel.cancelled() => { + return; + }, + tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { + ControlFlow::Break(()) => return, + ControlFlow::Continue(()) => (), + }, + } + + // We run ingest housekeeping with the same frequency as compaction: it is not worth + // having a distinct setting. But we don't run it in the same task, because compaction + // blocks on acquiring the background job semaphore. + let period = tenant.get_compaction_period(); + + // If compaction period is set to zero (to disable it), then we will use a reasonable default + let period = if period == Duration::ZERO { + humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD) + .unwrap() + .into() + } else { + period + }; + + // Jitter the period by +/- 5% + let period = + rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100); + + // Always sleep first: we do not need to do ingest housekeeping early in the lifetime of + // a tenant, since it won't have started writing any ephemeral files yet. 
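The period handling in the new housekeeping loop above has two parts: fall back to a default when the compaction period is zero (i.e. compaction disabled), then jitter by ±5% so tenants do not wake in lockstep. A minimal sketch of that arithmetic follows; `DEFAULT_PERIOD` stands in for the crate's `DEFAULT_COMPACTION_PERIOD` setting, and the sketch relies on `rand` supporting uniform sampling over `Duration` ranges, as the loop itself does.

```rust
use rand::Rng;
use std::time::Duration;

/// Stand-in for the configured default compaction period.
const DEFAULT_PERIOD: Duration = Duration::from_secs(20);

/// Illustrative sketch: pick the sleep interval for the next housekeeping pass.
fn jittered_period(configured: Duration) -> Duration {
    // A zero period means "compaction disabled"; still run housekeeping on a sane default.
    let period = if configured == Duration::ZERO {
        DEFAULT_PERIOD
    } else {
        configured
    };
    // Jitter the period by +/- 5%.
    rand::thread_rng().gen_range((period * 95) / 100..(period * 105) / 100)
}

fn main() {
    let p = jittered_period(Duration::from_secs(20));
    println!("sleeping for {p:?} before the next housekeeping pass");
}
```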
+ if tokio::time::timeout(period, cancel.cancelled()) + .await + .is_ok() + { + break; + } + + let started_at = Instant::now(); + tenant.ingest_housekeeping().await; + + warn_when_period_overrun( + started_at.elapsed(), + period, + BackgroundLoopKind::IngestHouseKeeping, + ); + } + } + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); +} + async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { // if the tenant has a proper status already, no need to wait for anything if tenant.current_state() == TenantState::Active { @@ -420,8 +503,6 @@ pub(crate) async fn random_init_delay( period: Duration, cancel: &CancellationToken, ) -> Result<(), Cancelled> { - use rand::Rng; - if period == Duration::ZERO { return Ok(()); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c7a5598cece1..505dc8c30d09 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,5 +1,6 @@ mod compaction; pub mod delete; +pub(crate) mod detach_ancestor; mod eviction_task; mod init; pub mod layer_manager; @@ -22,8 +23,9 @@ use pageserver_api::{ }, keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ - CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, - EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState, + AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo, + DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, + TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -58,6 +60,7 @@ use std::{ ops::ControlFlow, }; +use crate::tenant::timeline::init::LocalLayerFileMetadata; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, @@ -72,7 +75,7 @@ use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult, + LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }, }; @@ -862,9 +865,13 @@ impl Timeline { // Initialise the reconstruct state for the key with the cache // entry returned above. let mut reconstruct_state = ValuesReconstructState::new(); - let mut key_state = VectoredValueReconstructState::default(); - key_state.img = cached_page_img; - reconstruct_state.keys.insert(key, Ok(key_state)); + + // Only add the cached image to the reconstruct state when it exists. + if cached_page_img.is_some() { + let mut key_state = VectoredValueReconstructState::default(); + key_state.img = cached_page_img; + reconstruct_state.keys.insert(key, Ok(key_state)); + } let vectored_res = self .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx) @@ -1076,7 +1083,7 @@ impl Timeline { // We should generalize this into Keyspace::contains in the future. for range in &keyspace.ranges { if range.start.field1 < METADATA_KEY_BEGIN_PREFIX - || range.end.field1 >= METADATA_KEY_END_PREFIX + || range.end.field1 > METADATA_KEY_END_PREFIX { return Err(GetVectoredError::Other(anyhow::anyhow!( "only metadata keyspace can be scanned" @@ -1213,11 +1220,17 @@ impl Timeline { } reconstruct_timer.stop_and_record(); - // Note that this is an approximation. Tracking the exact number of layers visited - // per key requires virtually unbounded memory usage and is inefficient - // (i.e. 
segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED - .observe(layers_visited as f64 / results.len() as f64); + // For aux file keys (v1 or v2) the vectored read path does not return an error + // when they're missing. Instead they are omitted from the resulting btree + // (this is a requirement, not a bug). Skip updating the metric in these cases + // to avoid infinite results. + if !results.is_empty() { + // Note that this is an approximation. Tracking the exact number of layers visited + // per key requires virtually unbounded memory usage and is inefficient + // (i.e. segment tree tracking each range queried from a layer) + crate::metrics::VEC_READ_NUM_LAYERS_VISITED + .observe(layers_visited as f64 / results.len() as f64); + } Ok(results) } @@ -1494,15 +1507,21 @@ impl Timeline { /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { + self.freeze_and_flush0().await + } + + // This exists to provide a non-span creating version of `freeze_and_flush` we can call without + // polluting the span hierarchy. + pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> { let to_lsn = self.freeze_inmem_layer(false).await; self.flush_frozen_layers_and_wait(to_lsn).await } - /// If there is no writer, and conditions for rolling the latest layer are met, then freeze it. - /// - /// This is for use in background housekeeping, to provide guarantees of layers closing eventually - /// even if there are no ongoing writes to drive that. - async fn maybe_freeze_ephemeral_layer(&self) { + // Check if an open ephemeral layer should be closed: this provides + // background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping + // an ephemeral layer open forever when idle. It also freezes layers if the global limit on + // ephemeral layer bytes has been breached. + pub(super) async fn maybe_freeze_ephemeral_layer(&self) { let Ok(_write_guard) = self.write_lock.try_lock() else { // If the write lock is held, there is an active wal receiver: rolling open layers // is their responsibility while they hold this lock. @@ -1529,13 +1548,11 @@ impl Timeline { // we are a sharded tenant and have skipped some WAL let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { - // This should be somewhat rare, so we log it at INFO level. - // - // We checked for checkpoint timeout so that a shard without any - // data ingested (yet) doesn't write a remote index as soon as it + // Only do this if have been layer-less longer than get_checkpoint_timeout, so that a shard + // without any data ingested (yet) doesn't write a remote index as soon as it // sees its LSN advance: we only do this if we've been layer-less // for some time. - tracing::info!( + tracing::debug!( "Advancing disk_consistent_lsn past WAL ingest gap {} -> {}", disk_consistent_lsn, last_record_lsn @@ -1625,11 +1642,6 @@ impl Timeline { (guard, permit) }; - // Prior to compaction, check if an open ephemeral layer should be closed: this provides - // background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping - // an ephemeral layer open forever when idle. 
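As a side note on the vectored-read metric change earlier in this hunk: the recorded value is an average (layers visited divided by number of returned keys), so observing it for an empty result set would divide by zero and push an infinite sample into the histogram. A tiny self-contained sketch of the guarded computation, with hypothetical names, is shown below.

```rust
/// Illustrative sketch: compute "layers visited per key" only when there are results.
/// Returning `None` models "skip the metric observation".
fn layers_visited_per_key(layers_visited: u32, result_count: usize) -> Option<f64> {
    if result_count == 0 {
        // Aux-file reads may legitimately return no keys; record nothing in that case.
        None
    } else {
        Some(layers_visited as f64 / result_count as f64)
    }
}

fn main() {
    assert_eq!(layers_visited_per_key(8, 4), Some(2.0));
    assert_eq!(layers_visited_per_key(8, 0), None);
    println!("ok");
}
```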
- self.maybe_freeze_ephemeral_layer().await; - // this wait probably never needs any "long time spent" logging, because we already nag if // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { @@ -1899,7 +1911,7 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] pub(crate) async fn download_layer( &self, - layer_file_name: &str, + layer_file_name: &LayerName, ) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None); @@ -1917,7 +1929,10 @@ impl Timeline { /// Evict just one layer. /// /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`. - pub(crate) async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { + pub(crate) async fn evict_layer( + &self, + layer_file_name: &LayerName, + ) -> anyhow::Result> { let _gate = self .gate .enter() @@ -1991,13 +2006,12 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { - #[allow(dead_code)] - pub(crate) fn get_switch_to_aux_file_v2(&self) -> bool { + pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy { let tenant_conf = self.tenant_conf.load(); tenant_conf .tenant_conf - .switch_to_aux_file_v2 - .unwrap_or(self.conf.default_tenant_conf.switch_to_aux_file_v2) + .switch_aux_file_policy + .unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy) } pub(crate) fn get_lazy_slru_download(&self) -> bool { @@ -2379,13 +2393,13 @@ impl Timeline { index_part: Option, ) -> anyhow::Result<()> { use init::{Decision::*, Discovered, DismissedLayer}; - use LayerFileName::*; + use LayerName::*; let mut guard = self.layers.write().await; let timer = self.metrics.load_layer_map_histo.start_timer(); - // Scan timeline directory and create ImageFileName and DeltaFilename + // Scan timeline directory and create ImageLayerName and DeltaFilename // structs representing all files on disk let timeline_path = self .conf @@ -2409,8 +2423,8 @@ impl Timeline { for discovered in discovered { let (name, kind) = match discovered { - Discovered::Layer(file_name, file_size) => { - discovered_layers.push((file_name, file_size)); + Discovered::Layer(layer_file_name, local_path, file_size) => { + discovered_layers.push((layer_file_name, local_path, file_size)); continue; } Discovered::Metadata => { @@ -2460,31 +2474,30 @@ impl Timeline { Ok(UseRemote { local, remote }) => { // Remote is authoritative, but we may still choose to retain // the local file if the contents appear to match - if local.file_size() == remote.file_size() { + if local.metadata.file_size() == remote.file_size() { // Use the local file, but take the remote metadata so that we pick up // the correct generation. 
- UseLocal(remote) + UseLocal( + LocalLayerFileMetadata { + metadata: remote, + local_path: local.local_path + } + ) } else { - path.push(name.file_name()); - init::cleanup_local_file_for_remote(&path, &local, &remote)?; - path.pop(); + init::cleanup_local_file_for_remote(&local, &remote)?; UseRemote { local, remote } } } Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { - if local.is_some() { - path.push(name.file_name()); - init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?; - path.pop(); + if let Some(local) = local { + init::cleanup_future_layer(&local.local_path, &name, disk_consistent_lsn)?; } needs_cleanup.push(name); continue; } Err(DismissedLayer::LocalOnly(local)) => { - path.push(name.file_name()); - init::cleanup_local_only_file(&path, &name, &local)?; - path.pop(); + init::cleanup_local_only_file(&name, &local)?; // this file never existed remotely, we will have to do rework continue; } @@ -2498,9 +2511,9 @@ impl Timeline { tracing::debug!(layer=%name, ?decision, "applied"); let layer = match decision { - UseLocal(m) => { - total_physical_size += m.file_size(); - Layer::for_resident(conf, &this, name, m).drop_eviction_guard() + UseLocal(local) => { + total_physical_size += local.metadata.file_size(); + Layer::for_resident(conf, &this, local.local_path, name, local.metadata).drop_eviction_guard() } Evicted(remote) | UseRemote { remote, .. } => { Layer::for_evicted(conf, &this, name, remote) @@ -2981,11 +2994,11 @@ impl Timeline { } } - async fn find_layer(&self, layer_file_name: &str) -> Option { + async fn find_layer(&self, layer_name: &LayerName) -> Option { let guard = self.layers.read().await; for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.filename().file_name(); - if layer_file_name == historic_layer_name { + let historic_layer_name = historic_layer.layer_name(); + if layer_name == &historic_layer_name { return Some(guard.get_from_desc(&historic_layer)); } } @@ -3014,8 +3027,8 @@ impl Timeline { let last_activity_ts = layer.access_stats().latest_activity_or_now(); HeatMapLayer::new( - layer.layer_desc().filename(), - layer.metadata().into(), + layer.layer_desc().layer_name(), + (&layer.metadata()).into(), last_activity_ts, ) }); @@ -3024,6 +3037,18 @@ impl Timeline { Some(HeatMapTimeline::new(self.timeline_id, layers)) } + + /// Returns true if the given lsn is or was an ancestor branchpoint. + pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool { + // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original + // branchpoint in the value in IndexPart::lineage + self.ancestor_lsn == lsn + || (self.ancestor_lsn == Lsn::INVALID + && self + .remote_client + .as_ref() + .is_some_and(|rtc| rtc.is_previous_ancestor_lsn(lsn))) + } } type TraversalId = Arc; @@ -3161,7 +3186,7 @@ impl Timeline { if let Some(open_layer) = &layers.open_layer { let start_lsn = open_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); + //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display()); // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. 
let lsn_floor = max(cached_lsn + 1, start_lsn); @@ -3190,7 +3215,7 @@ impl Timeline { for frozen_layer in layers.frozen_layers.iter().rev() { let start_lsn = frozen_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); + //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display()); let lsn_floor = max(cached_lsn + 1, start_lsn); let frozen_layer = frozen_layer.clone(); @@ -3517,7 +3542,7 @@ impl Timeline { Ok(ancestor) } - fn get_ancestor_timeline(&self) -> anyhow::Result> { + pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result> { let ancestor = self.ancestor_timeline.as_ref().with_context(|| { format!( "Ancestor is missing. Timeline id: {} Ancestor id {:?}", @@ -4217,7 +4242,7 @@ impl Timeline { // Maybe flush `key_rest_accum` if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS - || last_key_in_range + || (last_key_in_range && key_request_accum.raw_size() > 0) { let results = self .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) @@ -4333,6 +4358,48 @@ impl Timeline { _ = self.cancel.cancelled() => {} ) } + + /// Detach this timeline from its ancestor by copying all of ancestors layers as this + /// Timelines layers up to the ancestor_lsn. + /// + /// Requires a timeline that: + /// - has an ancestor to detach from + /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not + /// a technical requirement + /// + /// After the operation has been started, it cannot be canceled. Upon restart it needs to be + /// polled again until completion. + /// + /// During the operation all timelines sharing the data with this timeline will be reparented + /// from our ancestor to be branches of this timeline. + pub(crate) async fn prepare_to_detach_from_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + options: detach_ancestor::Options, + ctx: &RequestContext, + ) -> Result< + ( + completion::Completion, + detach_ancestor::PreparedTimelineDetach, + ), + detach_ancestor::Error, + > { + detach_ancestor::prepare(self, tenant, options, ctx).await + } + + /// Completes the ancestor detach. This method is to be called while holding the + /// TenantManager's tenant slot, so during this method we cannot be deleted nor can any + /// timeline be deleted. After this method returns successfully, tenant must be reloaded. + /// + /// Pageserver receiving a SIGKILL during this operation is not supported (yet). + pub(crate) async fn complete_detaching_timeline_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + prepared: detach_ancestor::PreparedTimelineDetach, + ctx: &RequestContext, + ) -> Result, anyhow::Error> { + detach_ancestor::complete(self, tenant, prepared, ctx).await + } } /// Top-level failure to compact. 
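The two methods added above form a two-phase API. A minimal sketch of a caller driving both phases (a hypothetical helper, not part of this patch; the import paths and function name are assumptions, and per the doc comments the caller must hold the tenant slot and reload the tenant afterwards):

use std::sync::Arc;

// Assumed paths; inside the pageserver crate they may differ.
use crate::context::RequestContext;
use crate::tenant::timeline::{detach_ancestor, Timeline};
use crate::tenant::Tenant;
use utils::id::TimelineId;

/// Hypothetical driver for the two-phase ancestor detach.
async fn detach_from_ancestor(
    tenant: &Tenant,
    timeline: &Arc<Timeline>,
    ctx: &RequestContext,
) -> anyhow::Result<Vec<TimelineId>> {
    // Phase 1: rewrite/copy the ancestor's layers up to ancestor_lsn into this timeline.
    // Holding `_guard` keeps the per-tenant "detach ongoing" marker set, which blocks
    // concurrent detaches of other timelines in the same tenant.
    let (_guard, prepared) = timeline
        .prepare_to_detach_from_ancestor(tenant, detach_ancestor::Options::default(), ctx)
        .await?;

    // Phase 2: publish the copied layers in index_part.json, detach from the ancestor,
    // and reparent sibling timelines that branched at or before ancestor_lsn.
    let reparented = timeline
        .complete_detaching_timeline_ancestor(tenant, prepared, ctx)
        .await?;

    // Per the doc comment above, the tenant must be reloaded after successful completion.
    Ok(reparented)
}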
@@ -4441,6 +4508,24 @@ impl Timeline { Ok(()) } + async fn rewrite_layers( + self: &Arc, + replace_layers: Vec<(Layer, ResidentLayer)>, + drop_layers: Vec, + ) -> anyhow::Result<()> { + let mut guard = self.layers.write().await; + + guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics); + + let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); + + if let Some(remote_client) = self.remote_client.as_ref() { + remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?; + } + + Ok(()) + } + /// Schedules the uploads of the given image layers fn upload_new_image_layers( self: &Arc, @@ -4599,6 +4684,8 @@ impl Timeline { retain_lsns: Vec, new_gc_cutoff: Lsn, ) -> anyhow::Result { + // FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc + let now = SystemTime::now(); let mut result: GcResult = GcResult::default(); @@ -4652,7 +4739,7 @@ impl Timeline { if l.get_lsn_range().end > horizon_cutoff { debug!( "keeping {} because it's newer than horizon_cutoff {}", - l.filename(), + l.layer_name(), horizon_cutoff, ); result.layers_needed_by_cutoff += 1; @@ -4663,7 +4750,7 @@ impl Timeline { if l.get_lsn_range().end > pitr_cutoff { debug!( "keeping {} because it's newer than pitr_cutoff {}", - l.filename(), + l.layer_name(), pitr_cutoff, ); result.layers_needed_by_pitr += 1; @@ -4682,7 +4769,7 @@ impl Timeline { if &l.get_lsn_range().start <= retain_lsn { debug!( "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", - l.filename(), + l.layer_name(), retain_lsn, l.is_incremental(), ); @@ -4713,7 +4800,7 @@ impl Timeline { if !layers .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) { - debug!("keeping {} because it is the latest layer", l.filename()); + debug!("keeping {} because it is the latest layer", l.layer_name()); result.layers_not_updated += 1; continue 'outer; } @@ -4721,7 +4808,7 @@ impl Timeline { // We didn't find any reason to keep this file, so remove it. debug!( "garbage collecting {} is_dropped: xx is_incremental: {}", - l.filename(), + l.layer_name(), l.is_incremental(), ); layers_to_remove.push(l); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 1088101a1316..e83878b8fbb4 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -15,7 +15,8 @@ use anyhow::{anyhow, Context}; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::shard::{ShardIdentity, TenantShardId}; +use pageserver_api::keyspace::ShardedRange; +use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; @@ -93,7 +94,7 @@ impl Timeline { // Define partitioning schema if needed // FIXME: the match should only cover repartitioning, not the next steps - match self + let partition_count = match self .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), @@ -146,6 +147,7 @@ impl Timeline { assert!(sparse_layers.is_empty()); self.upload_new_image_layers(dense_layers)?; + dense_partitioning.parts.len() } Err(err) => { // no partitioning? 
This is normal, if the timeline was just created @@ -157,9 +159,150 @@ impl Timeline { if !self.cancel.is_cancelled() { tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } + 1 } }; + if self.shard_identity.count >= ShardCount::new(2) { + // Limit the number of layer rewrites to the number of partitions: this means its + // runtime should be comparable to a full round of image layer creations, rather than + // being potentially much longer. + let rewrite_max = partition_count; + + self.compact_shard_ancestors(rewrite_max, ctx).await?; + } + + Ok(()) + } + + /// Check for layers that are elegible to be rewritten: + /// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that + /// we don't indefinitely retain keys in this shard that aren't needed. + /// - For future use: layers beyond pitr_interval that are in formats we would + /// rather not maintain compatibility with indefinitely. + /// + /// Note: this phase may read and write many gigabytes of data: use rewrite_max to bound + /// how much work it will try to do in each compaction pass. + async fn compact_shard_ancestors( + self: &Arc, + rewrite_max: usize, + _ctx: &RequestContext, + ) -> anyhow::Result<()> { + let mut drop_layers = Vec::new(); + let layers_to_rewrite: Vec = Vec::new(); + + // We will use the PITR cutoff as a condition for rewriting layers. + let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr; + + let layers = self.layers.read().await; + for layer_desc in layers.layer_map().iter_historic_layers() { + let layer = layers.get_from_desc(&layer_desc); + if layer.metadata().shard.shard_count == self.shard_identity.count { + // This layer does not belong to a historic ancestor, no need to re-image it. + continue; + } + + // This layer was created on an ancestor shard: check if it contains any data for this shard. + let sharded_range = ShardedRange::new(layer_desc.get_key_range(), &self.shard_identity); + let layer_local_page_count = sharded_range.page_count(); + let layer_raw_page_count = ShardedRange::raw_size(&layer_desc.get_key_range()); + if layer_local_page_count == 0 { + // This ancestral layer only covers keys that belong to other shards. + // We include the full metadata in the log: if we had some critical bug that caused + // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers. + info!(%layer, old_metadata=?layer.metadata(), + "dropping layer after shard split, contains no keys for this shard.", + ); + + if cfg!(debug_assertions) { + // Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being + // wrong. 
If ShardedRange claims the local page count is zero, then no keys in this layer + // should be !is_key_disposable() + let range = layer_desc.get_key_range(); + let mut key = range.start; + while key < range.end { + debug_assert!(self.shard_identity.is_key_disposable(&key)); + key = key.next(); + } + } + + drop_layers.push(layer); + continue; + } else if layer_local_page_count != u32::MAX + && layer_local_page_count == layer_raw_page_count + { + debug!(%layer, + "layer is entirely shard local ({} keys), no need to filter it", + layer_local_page_count + ); + continue; + } + + // Don't bother re-writing a layer unless it will at least halve its size + if layer_local_page_count != u32::MAX + && layer_local_page_count > layer_raw_page_count / 2 + { + debug!(%layer, + "layer is already mostly local ({}/{}), not rewriting", + layer_local_page_count, + layer_raw_page_count + ); + } + + // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually + // without incurring the I/O cost of a rewrite. + if layer_desc.get_lsn_range().end >= pitr_cutoff { + debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})", + layer_desc.get_lsn_range().end, pitr_cutoff); + continue; + } + + if layer_desc.is_delta() { + // We do not yet implement rewrite of delta layers + debug!(%layer, "Skipping rewrite of delta layer"); + continue; + } + + // Only rewrite layers if they would have different remote paths: either they belong to this + // shard but an old generation, or they belonged to another shard. This also implicitly + // guarantees that the layer is persistent in remote storage (as only remote persistent + // layers are carried across shard splits, any local-only layer would be in the current generation) + if layer.metadata().generation == self.generation + && layer.metadata().shard.shard_count == self.shard_identity.count + { + debug!(%layer, "Skipping rewrite, is not from old generation"); + continue; + } + + if layers_to_rewrite.len() >= rewrite_max { + tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}", + layers_to_rewrite.len() + ); + continue; + } + + // Fall through: all our conditions for doing a rewrite passed. + // TODO: implement rewriting + tracing::debug!(%layer, "Would rewrite layer"); + } + + // Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`] + drop(layers); + + // TODO: collect layers to rewrite + let replace_layers = Vec::new(); + + // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage + self.rewrite_layers(replace_layers, drop_layers).await?; + + if let Some(remote_client) = self.remote_client.as_ref() { + // We wait for all uploads to complete before finishing this compaction stage. This is not + // necessary for correctness, but it simplifies testing, and avoids proceeding with another + // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O + // load. 
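Taken together, the filtering above boils down to a per-layer disposition. A condensed, illustrative restatement (names are invented, `u64` stands in for `Lsn`, and the "mostly local" case reflects the intent of its debug message rather than the literal control flow):

/// Illustrative summary of the eligibility checks in `compact_shard_ancestors`.
enum Disposition {
    Drop,    // covers no keys of this shard
    Keep,    // leave as-is
    Rewrite, // re-image into a shard-local layer (not yet implemented above)
}

struct AncestorLayerInfo {
    created_by_ancestor_shard: bool,
    local_pages: u32, // u32::MAX means "approximate/unknown"
    raw_pages: u32,
    end_lsn: u64,
    is_delta: bool,
    same_generation_and_shard: bool,
}

fn disposition(l: &AncestorLayerInfo, pitr_cutoff: u64) -> Disposition {
    if !l.created_by_ancestor_shard {
        return Disposition::Keep;
    }
    if l.local_pages == 0 {
        return Disposition::Drop;
    }
    let exact = l.local_pages != u32::MAX;
    if (exact && l.local_pages > l.raw_pages / 2) // already (mostly) shard-local
        || l.end_lsn >= pitr_cutoff // still inside the PITR window; will age out on its own
        || l.is_delta // delta-layer rewrites are not implemented yet
        || l.same_generation_and_shard // remote path would not change
    {
        return Disposition::Keep;
    }
    Disposition::Rewrite // bounded per compaction pass by `rewrite_max`
}

Dropped layers are removed from the layer map and scheduled for deletion via `schedule_compaction_update`; actual rewrites remain a TODO at this point in the series.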
+ remote_client.wait_completion().await?; + } + Ok(()) } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index af10c1c84b76..d8701be17013 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -422,6 +422,10 @@ impl DeleteTimelineFlow { pub(crate) fn is_finished(&self) -> bool { matches!(self, Self::Finished) } + + pub(crate) fn is_not_started(&self) -> bool { + matches!(self, Self::NotStarted) + } } struct DeletionGuard(OwnedMutexGuard); diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs new file mode 100644 index 000000000000..69b82344a652 --- /dev/null +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -0,0 +1,540 @@ +use std::sync::Arc; + +use super::{layer_manager::LayerManager, Timeline}; +use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + tenant::{ + storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, + Tenant, + }, + virtual_file::{MaybeFatalIo, VirtualFile}, +}; +use tokio_util::sync::CancellationToken; +use tracing::Instrument; +use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; + +#[derive(Debug, thiserror::Error)] +pub(crate) enum Error { + #[error("no ancestors")] + NoAncestor, + #[error("too many ancestors")] + TooManyAncestors, + #[error("shutting down, please retry later")] + ShuttingDown, + #[error("flushing failed")] + FlushAncestor(#[source] anyhow::Error), + #[error("layer download failed")] + RewrittenDeltaDownloadFailed(#[source] anyhow::Error), + #[error("copying LSN prefix locally failed")] + CopyDeltaPrefix(#[source] anyhow::Error), + #[error("upload rewritten layer")] + UploadRewritten(#[source] anyhow::Error), + + #[error("ancestor is already being detached by: {}", .0)] + OtherTimelineDetachOngoing(TimelineId), + + #[error("remote copying layer failed")] + CopyFailed(#[source] anyhow::Error), + + #[error("unexpected error")] + Unexpected(#[source] anyhow::Error), +} + +pub(crate) struct PreparedTimelineDetach { + layers: Vec, +} + +/// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments. 
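///
/// `rewrite_concurrency` bounds how many straddling delta layers have their LSN prefix
/// rewritten and uploaded in parallel during `prepare`; `copy_concurrency` bounds the
/// parallel remote copies of the remaining historic layers being adopted.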
+#[derive(Debug)] +pub(crate) struct Options { + pub(crate) rewrite_concurrency: std::num::NonZeroUsize, + pub(crate) copy_concurrency: std::num::NonZeroUsize, +} + +impl Default for Options { + fn default() -> Self { + Self { + rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(), + copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(), + } + } +} + +/// See [`Timeline::prepare_to_detach_from_ancestor`] +pub(super) async fn prepare( + detached: &Arc, + tenant: &Tenant, + options: Options, + ctx: &RequestContext, +) -> Result<(completion::Completion, PreparedTimelineDetach), Error> { + use Error::*; + + if detached.remote_client.as_ref().is_none() { + unimplemented!("no new code for running without remote storage"); + } + + let Some((ancestor, ancestor_lsn)) = detached + .ancestor_timeline + .as_ref() + .map(|tl| (tl.clone(), detached.ancestor_lsn)) + else { + return Err(NoAncestor); + }; + + if !ancestor_lsn.is_valid() { + return Err(NoAncestor); + } + + if ancestor.ancestor_timeline.is_some() { + // non-technical requirement; we could flatten N ancestors just as easily but we chose + // not to + return Err(TooManyAncestors); + } + + // before we acquire the gate, we must mark the ancestor as having a detach operation + // ongoing which will block other concurrent detach operations so we don't get to ackward + // situations where there would be two branches trying to reparent earlier branches. + let (guard, barrier) = completion::channel(); + + { + let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); + if let Some((tl, other)) = guard.as_ref() { + if !other.is_ready() { + return Err(OtherTimelineDetachOngoing(*tl)); + } + } + *guard = Some((detached.timeline_id, barrier)); + } + + let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + + if ancestor_lsn >= ancestor.get_disk_consistent_lsn() { + let span = + tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id); + async { + let started_at = std::time::Instant::now(); + let freeze_and_flush = ancestor.freeze_and_flush0(); + let mut freeze_and_flush = std::pin::pin!(freeze_and_flush); + + let res = + tokio::time::timeout(std::time::Duration::from_secs(1), &mut freeze_and_flush) + .await; + + let res = match res { + Ok(res) => res, + Err(_elapsed) => { + tracing::info!("freezing and flushing ancestor is still ongoing"); + freeze_and_flush.await + } + }; + + res.map_err(FlushAncestor)?; + + // we do not need to wait for uploads to complete but we do need `struct Layer`, + // copying delta prefix is unsupported currently for `InMemoryLayer`. + tracing::info!( + elapsed_ms = started_at.elapsed().as_millis(), + "froze and flushed the ancestor" + ); + Ok(()) + } + .instrument(span) + .await?; + } + + let end_lsn = ancestor_lsn + 1; + + let (filtered_layers, straddling_branchpoint, rest_of_historic) = { + // we do not need to start from our layers, because they can only be layers that come + // *after* ancestor_lsn + let layers = tokio::select! { + guard = ancestor.layers.read() => guard, + _ = detached.cancel.cancelled() => { + return Err(ShuttingDown); + } + _ = ancestor.cancel.cancelled() => { + return Err(ShuttingDown); + } + }; + + // between retries, these can change if compaction or gc ran in between. this will mean + // we have to redo work. + partition_work(ancestor_lsn, &layers) + }; + + // TODO: layers are already sorted by something: use that to determine how much of remote + // copies are already done. 
+ tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers"); + + // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after + let mut new_layers: Vec = + Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len()); + + { + tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers"); + + let mut tasks = tokio::task::JoinSet::new(); + + let mut wrote_any = false; + + let limiter = Arc::new(tokio::sync::Semaphore::new( + options.rewrite_concurrency.get(), + )); + + for layer in straddling_branchpoint { + let limiter = limiter.clone(); + let timeline = detached.clone(); + let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download); + + tasks.spawn(async move { + let _permit = limiter.acquire().await; + let copied = + upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) + .await?; + Ok(copied) + }); + } + + while let Some(res) = tasks.join_next().await { + match res { + Ok(Ok(Some(copied))) => { + wrote_any = true; + tracing::info!(layer=%copied, "rewrote and uploaded"); + new_layers.push(copied); + } + Ok(Ok(None)) => {} + Ok(Err(e)) => return Err(e), + Err(je) => return Err(Unexpected(je.into())), + } + } + + // FIXME: the fsync should be mandatory, after both rewrites and copies + if wrote_any { + let timeline_dir = VirtualFile::open( + &detached + .conf + .timeline_path(&detached.tenant_shard_id, &detached.timeline_id), + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } + } + + let mut tasks = tokio::task::JoinSet::new(); + let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get())); + + for adopted in rest_of_historic { + let limiter = limiter.clone(); + let timeline = detached.clone(); + + tasks.spawn( + async move { + let _permit = limiter.acquire().await; + let owned = + remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?; + tracing::info!(layer=%owned, "remote copied"); + Ok(owned) + } + .in_current_span(), + ); + } + + while let Some(res) = tasks.join_next().await { + match res { + Ok(Ok(owned)) => { + new_layers.push(owned); + } + Ok(Err(failed)) => { + return Err(failed); + } + Err(je) => return Err(Unexpected(je.into())), + } + } + + // TODO: fsync directory again if we hardlinked something + + let prepared = PreparedTimelineDetach { layers: new_layers }; + + Ok((guard, prepared)) +} + +fn partition_work( + ancestor_lsn: Lsn, + source_layermap: &LayerManager, +) -> (usize, Vec, Vec) { + let mut straddling_branchpoint = vec![]; + let mut rest_of_historic = vec![]; + + let mut later_by_lsn = 0; + + for desc in source_layermap.layer_map().iter_historic_layers() { + // off by one chances here: + // - start is inclusive + // - end is exclusive + if desc.lsn_range.start > ancestor_lsn { + later_by_lsn += 1; + continue; + } + + let target = if desc.lsn_range.start <= ancestor_lsn + && desc.lsn_range.end > ancestor_lsn + && desc.is_delta + { + // TODO: image layer at Lsn optimization + &mut straddling_branchpoint + } else { + &mut rest_of_historic + }; + + target.push(source_layermap.get_from_desc(&desc)); + } + + (later_by_lsn, straddling_branchpoint, rest_of_historic) +} + +async fn upload_rewritten_layer( + end_lsn: Lsn, + layer: &Layer, + target: &Arc, + cancel: &CancellationToken, + ctx: &RequestContext, +) -> 
Result, Error> { + use Error::UploadRewritten; + let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?; + + let Some(copied) = copied else { + return Ok(None); + }; + + // FIXME: better shuttingdown error + target + .remote_client + .as_ref() + .unwrap() + .upload_layer_file(&copied, cancel) + .await + .map_err(UploadRewritten)?; + + Ok(Some(copied.into())) +} + +async fn copy_lsn_prefix( + end_lsn: Lsn, + layer: &Layer, + target_timeline: &Arc, + ctx: &RequestContext, +) -> Result, Error> { + use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed}; + + tracing::debug!(%layer, %end_lsn, "copying lsn prefix"); + + let mut writer = DeltaLayerWriter::new( + target_timeline.conf, + target_timeline.timeline_id, + target_timeline.tenant_shard_id, + layer.layer_desc().key_range.start, + layer.layer_desc().lsn_range.start..end_lsn, + ) + .await + .map_err(CopyDeltaPrefix)?; + + let resident = layer + .download_and_keep_resident() + .await + // likely shutdown + .map_err(RewrittenDeltaDownloadFailed)?; + + let records = resident + .copy_delta_prefix(&mut writer, end_lsn, ctx) + .await + .map_err(CopyDeltaPrefix)?; + + drop(resident); + + tracing::debug!(%layer, records, "copied records"); + + if records == 0 { + drop(writer); + // TODO: we might want to store an empty marker in remote storage for this + // layer so that we will not needlessly walk `layer` on repeated attempts. + Ok(None) + } else { + // reuse the key instead of adding more holes between layers by using the real + // highest key in the layer. + let reused_highest_key = layer.layer_desc().key_range.end; + let copied = writer + .finish(reused_highest_key, target_timeline, ctx) + .await + .map_err(CopyDeltaPrefix)?; + + tracing::debug!(%layer, %copied, "new layer produced"); + + Ok(Some(copied)) + } +} + +/// Creates a new Layer instance for the adopted layer, and ensures it is found from the remote +/// storage on successful return without the adopted layer being added to `index_part.json`. +async fn remote_copy( + adopted: &Layer, + adoptee: &Arc, + generation: Generation, + cancel: &CancellationToken, +) -> Result { + use Error::CopyFailed; + + // depending if Layer::keep_resident we could hardlink + + let mut metadata = adopted.metadata(); + debug_assert!(metadata.generation <= generation); + metadata.generation = generation; + + let owned = crate::tenant::storage_layer::Layer::for_evicted( + adoptee.conf, + adoptee, + adopted.layer_desc().layer_name(), + metadata, + ); + + // FIXME: better shuttingdown error + adoptee + .remote_client + .as_ref() + .unwrap() + .copy_timeline_layer(adopted, &owned, cancel) + .await + .map(move |()| owned) + .map_err(CopyFailed) +} + +/// See [`Timeline::complete_detaching_timeline_ancestor`]. +pub(super) async fn complete( + detached: &Arc, + tenant: &Tenant, + prepared: PreparedTimelineDetach, + _ctx: &RequestContext, +) -> Result, anyhow::Error> { + let rtc = detached + .remote_client + .as_ref() + .expect("has to have a remote timeline client for timeline ancestor detach"); + + let PreparedTimelineDetach { layers } = prepared; + + let ancestor = detached + .get_ancestor_timeline() + .expect("must still have a ancestor"); + let ancestor_lsn = detached.get_ancestor_lsn(); + + // publish the prepared layers before we reparent any of the timelines, so that on restart + // reparented timelines find layers. also do the actual detaching. 
+ // + // if we crash after this operation, we will at least come up having detached a timeline, but + // we cannot go back and reparent the timelines which would had been reparented in normal + // execution. + // + // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart + // which could give us a completely wrong layer combination. + rtc.schedule_adding_existing_layers_to_index_detach_and_wait( + &layers, + (ancestor.timeline_id, ancestor_lsn), + ) + .await?; + + let mut tasks = tokio::task::JoinSet::new(); + + // because we are now keeping the slot in progress, it is unlikely that there will be any + // timeline deletions during this time. if we raced one, then we'll just ignore it. + tenant + .timelines + .lock() + .unwrap() + .values() + .filter_map(|tl| { + if Arc::ptr_eq(tl, detached) { + return None; + } + + if !tl.is_active() { + return None; + } + + let tl_ancestor = tl.ancestor_timeline.as_ref()?; + let is_same = Arc::ptr_eq(&ancestor, tl_ancestor); + let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; + + let is_deleting = tl + .delete_progress + .try_lock() + .map(|flow| !flow.is_not_started()) + .unwrap_or(true); + + if is_same && is_earlier && !is_deleting { + Some(tl.clone()) + } else { + None + } + }) + .for_each(|timeline| { + // important in this scope: we are holding the Tenant::timelines lock + let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); + let new_parent = detached.timeline_id; + + tasks.spawn( + async move { + let res = timeline + .remote_client + .as_ref() + .expect("reparented has to have remote client because detached has one") + .schedule_reparenting_and_wait(&new_parent) + .await; + + match res { + Ok(()) => Some(timeline), + Err(e) => { + // with the use of tenant slot, we no longer expect these. + tracing::warn!("reparenting failed: {e:#}"); + None + } + } + } + .instrument(span), + ); + }); + + let reparenting_candidates = tasks.len(); + let mut reparented = Vec::with_capacity(tasks.len()); + + while let Some(res) = tasks.join_next().await { + match res { + Ok(Some(timeline)) => { + tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); + reparented.push(timeline.timeline_id); + } + Ok(None) => { + // lets just ignore this for now. one or all reparented timelines could had + // started deletion, and that is fine. + } + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // ignore; it's better to continue with a single reparenting failing (or even + // all of them) in order to get to the goal state. + // + // these timelines will never be reparentable, but they can be always detached as + // separate tree roots. 
+ } + Err(je) => tracing::error!("unexpected join error: {je:?}"), + } + } + + if reparenting_candidates != reparented.len() { + tracing::info!("failed to reparent some candidates"); + } + + Ok(reparented) +} diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 916ebfc6d9f2..66aa76501510 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -6,13 +6,13 @@ use crate::{ self, index::{IndexPart, LayerFileMetadata}, }, - storage_layer::LayerFileName, + storage_layer::LayerName, Generation, }, METADATA_FILE_NAME, }; use anyhow::Context; -use camino::Utf8Path; +use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::ShardIndex; use std::{collections::HashMap, str::FromStr}; use utils::lsn::Lsn; @@ -20,7 +20,7 @@ use utils::lsn::Lsn; /// Identified files in the timeline directory. pub(super) enum Discovered { /// The only one we care about - Layer(LayerFileName, u64), + Layer(LayerName, Utf8PathBuf, u64), /// Old ephmeral files from previous launches, should be removed Ephemeral(String), /// Old temporary timeline files, unsure what these really are, should be removed @@ -43,10 +43,10 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result { let file_size = direntry.metadata()?.len(); - Discovered::Layer(file_name, file_size) + Discovered::Layer(file_name, direntry.path().to_owned(), file_size) } Err(_) => { if file_name == METADATA_FILE_NAME { @@ -72,6 +72,28 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result Self { + Self { + local_path, + metadata: LayerFileMetadata::new(file_size, generation, shard), + } + } +} + /// Decision on what to do with a layer file after considering its local and remote metadata. #[derive(Clone, Debug)] pub(super) enum Decision { @@ -80,11 +102,11 @@ pub(super) enum Decision { /// The layer is present locally, but local metadata does not match remote; we must /// delete it and treat it as evicted. UseRemote { - local: LayerFileMetadata, + local: LocalLayerFileMetadata, remote: LayerFileMetadata, }, /// The layer is present locally, and metadata matches. - UseLocal(LayerFileMetadata), + UseLocal(LocalLayerFileMetadata), } /// A layer needs to be left out of the layer map. @@ -92,39 +114,42 @@ pub(super) enum Decision { pub(super) enum DismissedLayer { /// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded. Future { - /// The local metadata. `None` if the layer is only known through [`IndexPart`]. - local: Option, + /// `None` if the layer is only known through [`IndexPart`]. + local: Option, }, /// The layer only exists locally. /// /// In order to make crash safe updates to layer map, we must dismiss layers which are only /// found locally or not yet included in the remote `index_part.json`. - LocalOnly(LayerFileMetadata), + LocalOnly(LocalLayerFileMetadata), } /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions. 
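///
/// Every layer known locally and/or remotely resolves either to a [`Decision`] (keep the
/// local file, defer to the remote metadata, or treat the layer as evicted) or to a
/// [`DismissedLayer`] for layers that must not be loaded: those newer than
/// `disk_consistent_lsn`, or those that exist only locally and were never uploaded.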
pub(super) fn reconcile( - discovered: Vec<(LayerFileName, u64)>, + discovered: Vec<(LayerName, Utf8PathBuf, u64)>, index_part: Option<&IndexPart>, disk_consistent_lsn: Lsn, generation: Generation, shard: ShardIndex, -) -> Vec<(LayerFileName, Result)> { +) -> Vec<(LayerName, Result)> { use Decision::*; - // name => (local, remote) - type Collected = HashMap, Option)>; + // name => (local_metadata, remote_metadata) + type Collected = + HashMap, Option)>; let mut discovered = discovered .into_iter() - .map(|(name, file_size)| { + .map(|(layer_name, local_path, file_size)| { ( - name, + layer_name, // The generation and shard here will be corrected to match IndexPart in the merge below, unless // it is not in IndexPart, in which case using our current generation makes sense // because it will be uploaded in this generation. ( - Some(LayerFileMetadata::new(file_size, generation, shard)), + Some(LocalLayerFileMetadata::new( + local_path, file_size, generation, shard, + )), None, ), ) @@ -153,7 +178,7 @@ pub(super) fn reconcile( Err(DismissedLayer::Future { local }) } else { match (local, remote) { - (Some(local), Some(remote)) if local != remote => { + (Some(local), Some(remote)) if local.metadata != remote => { Ok(UseRemote { local, remote }) } (Some(x), Some(_)) => Ok(UseLocal(x)), @@ -177,12 +202,12 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> { } pub(super) fn cleanup_local_file_for_remote( - path: &Utf8Path, - local: &LayerFileMetadata, + local: &LocalLayerFileMetadata, remote: &LayerFileMetadata, ) -> anyhow::Result<()> { - let local_size = local.file_size(); + let local_size = local.metadata.file_size(); let remote_size = remote.file_size(); + let path = &local.local_path; let file_name = path.file_name().expect("must be file path"); tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); @@ -199,7 +224,7 @@ pub(super) fn cleanup_local_file_for_remote( pub(super) fn cleanup_future_layer( path: &Utf8Path, - name: &LayerFileName, + name: &LayerName, disk_consistent_lsn: Lsn, ) -> anyhow::Result<()> { // future image layers are allowed to be produced always for not yet flushed to disk @@ -211,12 +236,14 @@ pub(super) fn cleanup_future_layer( } pub(super) fn cleanup_local_only_file( - path: &Utf8Path, - name: &LayerFileName, - local: &LayerFileMetadata, + name: &LayerName, + local: &LocalLayerFileMetadata, ) -> anyhow::Result<()> { let kind = name.kind(); - tracing::info!("found local-only {kind} layer {name}, metadata {local:?}"); - std::fs::remove_file(path)?; + tracing::info!( + "found local-only {kind} layer {name}, metadata {:?}", + local.metadata + ); + std::fs::remove_file(&local.local_path)?; Ok(()) } diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 64edcc5e409b..a72eb1b3bffb 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -205,6 +205,24 @@ impl LayerManager { updates.flush(); } + /// Called when compaction is completed. 
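    /// Currently only `drop_layers` is applied; `rewrite_layers` must be empty until layer
    /// rewrites are implemented (see the assert below). Scheduling the corresponding remote
    /// `index_part.json` update is left to the caller (`Timeline::rewrite_layers`).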
+ pub(crate) fn rewrite_layers( + &mut self, + rewrite_layers: &[(Layer, ResidentLayer)], + drop_layers: &[Layer], + _metrics: &TimelineMetrics, + ) { + let mut updates = self.layer_map.batch_update(); + + // TODO: implement rewrites (currently this code path only used for drops) + assert!(rewrite_layers.is_empty()); + + for l in drop_layers { + Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr); + } + updates.flush(); + } + /// Called when garbage collect has selected the layers to be removed. pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) { let mut updates = self.layer_map.batch_update(); @@ -276,7 +294,7 @@ impl LayerFileManager { // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. self.0 .get(&desc.key()) - .with_context(|| format!("get layer from desc: {}", desc.filename())) + .with_context(|| format!("get layer from desc: {}", desc.layer_name())) .expect("not found") .clone() } diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 0bf4d1e5991f..a2f761fa949a 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,8 +1,9 @@ -use super::storage_layer::LayerFileName; +use super::storage_layer::LayerName; use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::remote_timeline_client::index::Lineage; use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; @@ -45,7 +46,7 @@ pub(crate) struct UploadQueueInitialized { /// All layer files stored in the remote storage, taking into account all /// in-progress and queued operations - pub(crate) latest_files: HashMap, + pub(crate) latest_files: HashMap, /// How many file uploads or deletions been scheduled, since the /// last (scheduling of) metadata index upload? @@ -56,6 +57,9 @@ pub(crate) struct UploadQueueInitialized { /// DANGER: do not return to outside world, e.g., safekeepers. pub(crate) latest_metadata: TimelineMetadata, + /// Part of the flattened "next" `index_part.json`. + pub(crate) latest_lineage: Lineage, + /// `disk_consistent_lsn` from the last metadata file that was successfully /// uploaded. `Lsn(0)` if nothing was uploaded yet. /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. @@ -89,7 +93,7 @@ pub(crate) struct UploadQueueInitialized { /// Putting this behind a testing feature to catch problems in tests, but assuming we could have a /// bug causing leaks, then it's better to not leave this enabled for production builds. #[cfg(feature = "testing")] - pub(crate) dangling_files: HashMap, + pub(crate) dangling_files: HashMap, /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`. 
pub(crate) shutting_down: bool, @@ -171,6 +175,7 @@ impl UploadQueue { latest_files: HashMap::new(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: metadata.clone(), + latest_lineage: Lineage::default(), projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations @@ -218,6 +223,7 @@ impl UploadQueue { latest_files: files, latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: index_part.metadata.clone(), + latest_lineage: index_part.lineage.clone(), projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()), visible_remote_consistent_lsn: Arc::new( index_part.metadata.disk_consistent_lsn().into(), @@ -281,7 +287,7 @@ pub(crate) struct UploadTask { /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug)] pub(crate) struct Delete { - pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, } #[derive(Debug)] @@ -290,7 +296,7 @@ pub(crate) enum UploadOp { UploadLayer(ResidentLayer, LayerFileMetadata), /// Upload the metadata file - UploadMetadata(IndexPart, Lsn), + UploadMetadata(Box, Lsn), /// Delete layer files Delete(Delete), diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 0bcb9545a674..cd316dbb9141 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -14,7 +14,8 @@ OBJS = \ relsize_cache.o \ walproposer.o \ walproposer_pg.o \ - control_plane_connector.o + control_plane_connector.o \ + walsender_hooks.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 8d236144b5c5..b69a3819c9b5 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -34,6 +34,7 @@ #include "walproposer.h" #include "pagestore_client.h" #include "control_plane_connector.h" +#include "walsender_hooks.h" PG_MODULE_MAGIC; void _PG_init(void); @@ -265,7 +266,6 @@ LogicalSlotsMonitorMain(Datum main_arg) } } - void _PG_init(void) { @@ -279,6 +279,7 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); + WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitLogicalReplicationMonitor(); diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index f7ec9e5bfae9..e43f4d9d96c0 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -36,10 +36,7 @@ static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state); -static void NeonWALReaderResetRemote(NeonWALReader *state); static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); -static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); -static void neon_wal_segment_close(NeonWALReader *state); static bool is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli); @@ -82,8 +79,9 @@ struct NeonWALReader XLogRecPtr req_lsn; Size req_len; Size req_progress; - WalProposer *wp; /* we learn donor through walproposer */ + char donor_conninfo[MAXCONNINFO]; char donor_name[64]; /* saved donor safekeeper name for logging */ + XLogRecPtr donor_lsn; /* state of connection to safekeeper */ NeonWALReaderRemoteState rem_state; WalProposerConn *wp_conn; @@ -107,7 +105,7 @@ struct NeonWALReader /* palloc and initialize NeonWALReader */ NeonWALReader * 
-NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix) +NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix) { NeonWALReader *reader; @@ -123,8 +121,6 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalPropose reader->seg.ws_tli = 0; reader->segcxt.ws_segsize = wal_segment_size; - reader->wp = wp; - reader->rem_state = RS_NONE; if (log_prefix) @@ -204,21 +200,16 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou { if (state->rem_state == RS_NONE) { - XLogRecPtr donor_lsn; - - /* no connection yet; start one */ - Safekeeper *donor = GetDonor(state->wp, &donor_lsn); - - if (donor == NULL) + if (!NeonWALReaderUpdateDonor(state)) { snprintf(state->err_msg, sizeof(state->err_msg), "failed to establish remote connection to fetch WAL: no donor available"); return NEON_WALREAD_ERROR; + } - snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port); - nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL", - state->donor_name, LSN_FORMAT_ARGS(donor_lsn)); - state->wp_conn = libpqwp_connect_start(donor->conninfo); + /* no connection yet; start one */ + nwr_log(LOG, "establishing connection to %s, lsn=%X/%X to fetch WAL", state->donor_name, LSN_FORMAT_ARGS(state->donor_lsn)); + state->wp_conn = libpqwp_connect_start(state->donor_conninfo); if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD) { snprintf(state->err_msg, sizeof(state->err_msg), @@ -251,10 +242,22 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou { /* connection successfully established */ char start_repl_query[128]; - + term_t term = pg_atomic_read_u64(&GetWalpropShmemState()->mineLastElectedTerm); + + /* + * Set elected walproposer's term to pull only data from + * its history. Note: for logical walsender it means we + * might stream WAL not yet committed by safekeepers. It + * would be cleaner to fix this. + * + * mineLastElectedTerm shouldn't be 0 at this point + * because we checked above that donor exists and it + * appears only after successfull election. + */ + Assert(term > 0); snprintf(start_repl_query, sizeof(start_repl_query), "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')", - LSN_FORMAT_ARGS(startptr), state->wp->propTerm); + LSN_FORMAT_ARGS(startptr), term); nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s", state->donor_name, start_repl_query); if (!libpqwp_send_query(state->wp_conn, start_repl_query)) @@ -404,6 +407,10 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou state->req_lsn = InvalidXLogRecPtr; state->req_len = 0; state->req_progress = 0; + + /* Update the current segment info. */ + state->seg.ws_tli = tli; + return NEON_WALREAD_SUCCESS; } } @@ -526,7 +533,7 @@ NeonWALReaderReadMsg(NeonWALReader *state) } /* reset remote connection and request in progress */ -static void +void NeonWALReaderResetRemote(NeonWALReader *state) { state->req_lsn = InvalidXLogRecPtr; @@ -691,13 +698,25 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun return true; } +XLogRecPtr +NeonWALReaderGetRemLsn(NeonWALReader *state) +{ + return state->rem_lsn; +} + +const WALOpenSegment * +NeonWALReaderGetSegment(NeonWALReader *state) +{ + return &state->seg; +} + /* * Copy of vanilla wal_segment_open, but returns false in case of error instead * of ERROR, with errno set. 
* * XLogReaderRoutine->segment_open callback for local pg_wal files */ -static bool +bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p) { @@ -724,7 +743,7 @@ is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli) } /* copy of vanilla wal_segment_close with NeonWALReader */ -static void +void neon_wal_segment_close(NeonWALReader *state) { if (state->seg.ws_file >= 0) @@ -740,3 +759,19 @@ NeonWALReaderErrMsg(NeonWALReader *state) { return state->err_msg; } + +/* + * Returns true if there is a donor, and false otherwise + */ +bool +NeonWALReaderUpdateDonor(NeonWALReader *state) +{ + WalproposerShmemState *wps = GetWalpropShmemState(); + + SpinLockAcquire(&wps->mutex); + memcpy(state->donor_name, wps->donor_name, sizeof(state->donor_name)); + memcpy(state->donor_conninfo, wps->donor_conninfo, sizeof(state->donor_conninfo)); + state->donor_lsn = wps->donor_lsn; + SpinLockRelease(&wps->mutex); + return state->donor_name[0] != '\0'; +} diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h index 6be9f149aa15..3e4182506980 100644 --- a/pgxn/neon/neon_walreader.h +++ b/pgxn/neon/neon_walreader.h @@ -19,12 +19,19 @@ typedef enum NEON_WALREAD_ERROR, } NeonWALReadResult; -extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix); +extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix); extern void NeonWALReaderFree(NeonWALReader *state); +extern void NeonWALReaderResetRemote(NeonWALReader *state); extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); extern pgsocket NeonWALReaderSocket(NeonWALReader *state); extern uint32 NeonWALReaderEvents(NeonWALReader *state); extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state); extern char *NeonWALReaderErrMsg(NeonWALReader *state); +extern XLogRecPtr NeonWALReaderGetRemLsn(NeonWALReader *state); +extern const WALOpenSegment *NeonWALReaderGetSegment(NeonWALReader *state); +extern bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); +extern void neon_wal_segment_close(NeonWALReader *state); +extern bool NeonWALReaderUpdateDonor(NeonWALReader *state); + #endif /* __NEON_WALREADER_H__ */ diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index d7987954d4a2..dbc67a24f5ca 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -80,7 +80,7 @@ static int CompareLsn(const void *a, const void *b); static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); - +static void UpdateDonorShmem(WalProposer *wp); WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) @@ -922,7 +922,8 @@ static void DetermineEpochStartLsn(WalProposer *wp) { TermHistory *dth; - int n_ready = 0; + int n_ready = 0; + WalproposerShmemState *walprop_shared; wp->propEpochStartLsn = InvalidXLogRecPtr; wp->donorEpoch = 0; @@ -964,16 +965,18 @@ DetermineEpochStartLsn(WalProposer *wp) if (n_ready < wp->quorum) { /* - * This is a rare case that can be triggered if safekeeper has voted and disconnected. - * In this case, its state will not be SS_IDLE and its vote cannot be used, because - * we clean up `voteResponse` in `ShutdownConnection`. + * This is a rare case that can be triggered if safekeeper has voted + * and disconnected. 
In this case, its state will not be SS_IDLE and + * its vote cannot be used, because we clean up `voteResponse` in + * `ShutdownConnection`. */ wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready); } /* - * If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping - * and nothing was committed yet. Start streaming then from the basebackup LSN. + * If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are + * bootstrapping and nothing was committed yet. Start streaming then from + * the basebackup LSN. */ if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers) { @@ -984,11 +987,12 @@ DetermineEpochStartLsn(WalProposer *wp) } wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } + pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn); /* - * Safekeepers are setting truncateLsn after timelineStartLsn is known, so it - * should never be zero at this point, if we know timelineStartLsn. - * + * Safekeepers are setting truncateLsn after timelineStartLsn is known, so + * it should never be zero at this point, if we know timelineStartLsn. + * * timelineStartLsn can be zero only on the first syncSafekeepers run. */ Assert((wp->truncateLsn != InvalidXLogRecPtr) || @@ -1022,10 +1026,9 @@ DetermineEpochStartLsn(WalProposer *wp) * since which we are going to write according to the consensus. If not, * we must bail out, as clog and other non rel data is inconsistent. */ + walprop_shared = wp->api.get_shmem_state(wp); if (!wp->config->syncSafekeepers) { - WalproposerShmemState *walprop_shared = wp->api.get_shmem_state(wp); - /* * Basebackup LSN always points to the beginning of the record (not * the page), as StartupXLOG most probably wants it this way. @@ -1040,7 +1043,7 @@ DetermineEpochStartLsn(WalProposer *wp) * compute (who could generate WAL) is ok. */ if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == - walprop_shared->mineLastElectedTerm))) + pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm)))) { /* * Panic to restart PG as we need to retake basebackup. 
@@ -1054,8 +1057,8 @@ DetermineEpochStartLsn(WalProposer *wp) LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); } } - walprop_shared->mineLastElectedTerm = wp->propTerm; } + pg_atomic_write_u64(&walprop_shared->mineLastElectedTerm, wp->propTerm); } /* @@ -1105,9 +1108,13 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" , - sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); - /* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */ + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u", + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); + + /* + * wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline + * is created manually (test_s3_wal_replay) + */ Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); } else @@ -1177,6 +1184,12 @@ StartStreaming(Safekeeper *sk) sk->active_state = SS_ACTIVE_SEND; sk->streamingAt = sk->startStreamingAt; + /* + * Donors can only be in SS_ACTIVE state, so we potentially update the + * donor when we switch one to SS_ACTIVE. + */ + UpdateDonorShmem(sk->wp); + /* event set will be updated inside SendMessageToNode */ SendMessageToNode(sk); } @@ -1568,17 +1581,17 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) * none if it doesn't exist. donor_lsn is set to end position of the donor to * the best of our knowledge. */ -Safekeeper * -GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) +static void +UpdateDonorShmem(WalProposer *wp) { Safekeeper *donor = NULL; int i; - *donor_lsn = InvalidXLogRecPtr; + XLogRecPtr donor_lsn = InvalidXLogRecPtr; if (wp->n_votes < wp->quorum) { - wp_log(WARNING, "GetDonor called before elections are won"); - return NULL; + wp_log(WARNING, "UpdateDonorShmem called before elections are won"); + return; } /* @@ -1589,7 +1602,7 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) if (wp->safekeeper[wp->donor].state >= SS_IDLE) { donor = &wp->safekeeper[wp->donor]; - *donor_lsn = wp->propEpochStartLsn; + donor_lsn = wp->propEpochStartLsn; } /* @@ -1601,13 +1614,19 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) { Safekeeper *sk = &wp->safekeeper[i]; - if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn) + if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > donor_lsn) { donor = sk; - *donor_lsn = sk->appendResponse.flushLsn; + donor_lsn = sk->appendResponse.flushLsn; } } - return donor; + + if (donor == NULL) + { + wp_log(WARNING, "UpdateDonorShmem didn't find a suitable donor, skipping"); + return; + } + wp->api.update_donor(wp, donor, donor_lsn); } /* @@ -1617,7 +1636,7 @@ static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk) { XLogRecPtr candidateTruncateLsn; - XLogRecPtr newCommitLsn; + XLogRecPtr newCommitLsn; newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); if (newCommitLsn > wp->commitLsn) @@ -1627,7 +1646,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk) BroadcastAppendRequest(wp); } - /* + /* * Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown(). 
* The last one will terminate the process if the shutdown is requested * and WAL is committed by the quorum. BroadcastAppendRequest() should be diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 69a557fdf2d8..41daeb87b981 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -284,14 +284,19 @@ typedef struct PageserverFeedback typedef struct WalproposerShmemState { + pg_atomic_uint64 propEpochStartLsn; + char donor_name[64]; + char donor_conninfo[MAXCONNINFO]; + XLogRecPtr donor_lsn; + slock_t mutex; - term_t mineLastElectedTerm; + pg_atomic_uint64 mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; pg_atomic_uint64 currentClusterSize; /* last feedback from each shard */ PageserverFeedback shard_ps_feedback[MAX_SHARDS]; - int num_shards; + int num_shards; /* aggregated feedback with min LSNs across shards */ PageserverFeedback min_ps_feedback; @@ -465,6 +470,9 @@ typedef struct walproposer_api /* Get pointer to the latest available WAL. */ XLogRecPtr (*get_flush_rec_ptr) (WalProposer *wp); + /* Update current donor info in WalProposer Shmem */ + void (*update_donor) (WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn); + /* Get current time. */ TimestampTz (*get_current_timestamp) (WalProposer *wp); @@ -497,7 +505,7 @@ typedef struct walproposer_api * * On success, the data is placed in *buf. It is valid until the next call * to this function. - * + * * Returns PG_ASYNC_READ_FAIL on closed connection. */ PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount); @@ -545,13 +553,14 @@ typedef struct walproposer_api * Returns 0 if timeout is reached, 1 if some event happened. Updates * events mask to indicate events and sets sk to the safekeeper which has * an event. - * + * * On timeout, events is set to WL_NO_EVENTS. On socket event, events is * set to WL_SOCKET_READABLE and/or WL_SOCKET_WRITEABLE. When socket is * closed, events is set to WL_SOCKET_READABLE. - * - * WL_SOCKET_WRITEABLE is usually set only when we need to flush the buffer. - * It can be returned only if caller asked for this event in the last *_event_set call. + * + * WL_SOCKET_WRITEABLE is usually set only when we need to flush the + * buffer. It can be returned only if caller asked for this event in the + * last *_event_set call. */ int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events); @@ -571,9 +580,9 @@ typedef struct walproposer_api void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn); /* - * Called after every AppendResponse from the safekeeper. Used to propagate - * backpressure feedback and to confirm WAL persistence (has been commited - * on the quorum of safekeepers). + * Called after every AppendResponse from the safekeeper. Used to + * propagate backpressure feedback and to confirm WAL persistence (has + * been commited on the quorum of safekeepers). */ void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk); @@ -716,12 +725,14 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt extern void WalProposerPoll(WalProposer *wp); extern void WalProposerFree(WalProposer *wp); +extern WalproposerShmemState *GetWalpropShmemState(); + /* * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to * recreate set from scratch, hence the export. 
*/ extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events); -extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn); +extern TimeLineID walprop_pg_get_timeline_id(void); #define WPEVENT 1337 /* special log level for walproposer internal diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 7debb6325ea2..e5ef93b45632 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -85,7 +85,6 @@ static void walprop_pg_init_standalone_sync_safekeepers(void); static void walprop_pg_init_walsender(void); static void walprop_pg_init_bgworker(void); static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp); -static TimeLineID walprop_pg_get_timeline_id(void); static void walprop_pg_load_libpqwalreceiver(void); static process_interrupts_callback_t PrevProcessInterruptsCallback; @@ -94,6 +93,8 @@ static shmem_startup_hook_type prev_shmem_startup_hook_type; static shmem_request_hook_type prev_shmem_request_hook = NULL; static void walproposer_shmem_request(void); #endif +static void WalproposerShmemInit_SyncSafekeeper(void); + static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd); static void WalSndLoop(WalProposer *wp); @@ -136,6 +137,7 @@ WalProposerSync(int argc, char *argv[]) WalProposer *wp; init_walprop_config(true); + WalproposerShmemInit_SyncSafekeeper(); walprop_pg_init_standalone_sync_safekeepers(); walprop_pg_load_libpqwalreceiver(); @@ -281,6 +283,8 @@ WalproposerShmemInit(void) { memset(walprop_shared, 0, WalproposerShmemSize()); SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); + pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0); } @@ -289,6 +293,17 @@ WalproposerShmemInit(void) return found; } +static void +WalproposerShmemInit_SyncSafekeeper(void) +{ + walprop_shared = palloc(WalproposerShmemSize()); + memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); + pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); + pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); +} + #define BACK_PRESSURE_DELAY 10000L // 0.01 sec static bool @@ -399,6 +414,13 @@ nwp_shmem_startup_hook(void) WalproposerShmemInit(); } +WalproposerShmemState * +GetWalpropShmemState() +{ + Assert(walprop_shared != NULL); + return walprop_shared; +} + static WalproposerShmemState * walprop_pg_get_shmem_state(WalProposer *wp) { @@ -431,14 +453,15 @@ record_pageserver_feedback(PageserverFeedback *ps_feedback) for (int i = 0; i < walprop_shared->num_shards; i++) { PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i]; + if (feedback->present) { if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn) min_feedback.last_received_lsn = feedback->last_received_lsn; - + if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn) min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn; - + if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn) min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn; } @@ -551,6 +574,7 @@ static void walprop_sigusr2(SIGNAL_ARGS) { 
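	/* Record that SIGUSR2 arrived and wake the latch so the main loop can notice it. */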
int save_errno = errno; + got_SIGUSR2 = true; SetLatch(MyLatch); errno = save_errno; @@ -598,7 +622,7 @@ walprop_pg_get_current_timestamp(WalProposer *wp) return GetCurrentTimestamp(); } -static TimeLineID +TimeLineID walprop_pg_get_timeline_id(void) { #if PG_VERSION_NUM >= 150000 @@ -617,6 +641,20 @@ walprop_pg_load_libpqwalreceiver(void) wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly"); } +static void +walprop_pg_update_donor(WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn) +{ + WalproposerShmemState *wps = wp->api.get_shmem_state(wp); + char donor_name[64]; + + pg_snprintf(donor_name, sizeof(donor_name), "%s:%s", donor->host, donor->port); + SpinLockAcquire(&wps->mutex); + memcpy(wps->donor_name, donor_name, sizeof(donor_name)); + memcpy(wps->donor_conninfo, donor->conninfo, sizeof(donor->conninfo)); + wps->donor_lsn = donor_lsn; + SpinLockRelease(&wps->mutex); +} + /* Helper function */ static bool ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) @@ -717,7 +755,6 @@ walprop_connect_start(Safekeeper *sk) { Assert(sk->conn == NULL); sk->conn = libpqwp_connect_start(sk->conninfo); - } static WalProposerConnectPollStatusType @@ -1091,7 +1128,7 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) { XLogRecPtr FlushPtr; - __attribute__((unused)) TimeLineID currTLI; + __attribute__((unused)) TimeLineID currTLI; #if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) @@ -1295,116 +1332,13 @@ XLogBroadcastWalProposer(WalProposer *wp) } } -/* Download WAL before basebackup for logical walsenders from sk, if needed */ +/* + Used to download WAL before basebackup for logical walsenders from sk, no longer + needed because walsender always uses neon_walreader. + */ static bool WalProposerRecovery(WalProposer *wp, Safekeeper *sk) { - char *err; - WalReceiverConn *wrconn; - WalRcvStreamOptions options; - char conninfo[MAXCONNINFO]; - TimeLineID timeline; - XLogRecPtr startpos; - XLogRecPtr endpos; - - startpos = GetLogRepRestartLSN(wp); - if (startpos == InvalidXLogRecPtr) - return true; /* recovery not needed */ - endpos = wp->propEpochStartLsn; - - timeline = wp->greetRequest.timeline; - - if (!neon_auth_token) - { - memcpy(conninfo, sk->conninfo, MAXCONNINFO); - } - else - { - int written = 0; - - written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo); - if (written > MAXCONNINFO || written < 0) - wpg_log(FATAL, "could not append password to the safekeeper connection string"); - } - -#if PG_MAJORVERSION_NUM < 16 - wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); -#else - wrconn = walrcv_connect(conninfo, false, false, "wal_proposer_recovery", &err); -#endif - - if (!wrconn) - { - ereport(WARNING, - (errmsg("could not connect to WAL acceptor %s:%s: %s", - sk->host, sk->port, - err))); - return false; - } - wpg_log(LOG, - "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - sk->host, sk->port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); - - options.logical = false; - options.startpoint = startpos; - options.slotname = NULL; - options.proto.physical.startpointTLI = timeline; - - if (walrcv_startstreaming(wrconn, &options)) - { - XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn = 0; - int len; - char *buf; - pgsocket wait_fd = PGINVALID_SOCKET; - - while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) - { - if (len == 0) - { - (void) 
WaitLatchOrSocket( - MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, - -1, WAIT_EVENT_WAL_RECEIVER_MAIN); - } - else - { - Assert(buf[0] == 'w' || buf[0] == 'k'); - if (buf[0] == 'k') - continue; /* keepalive */ - memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], - sizeof rec_start_lsn); - rec_start_lsn = pg_ntoh64(rec_start_lsn); - rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - - /* write WAL to disk */ - XLogWalPropWrite(sk->wp, &buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); - - ereport(DEBUG1, - (errmsg("Recover message %X/%X length %d", - LSN_FORMAT_ARGS(rec_start_lsn), len))); - if (rec_end_lsn >= endpos) - break; - } - } - ereport(LOG, - (errmsg("end of replication stream at %X/%X: %m", - LSN_FORMAT_ARGS(rec_end_lsn)))); - walrcv_disconnect(wrconn); - - /* failed to receive all WAL till endpos */ - if (rec_end_lsn < endpos) - return false; - } - else - { - ereport(LOG, - (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", - timeline, (uint32) (startpos >> 32), (uint32) startpos))); - return false; - } - return true; } @@ -1545,7 +1479,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk) snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); Assert(!sk->xlogreader); - sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, log_prefix); if (sk->xlogreader == NULL) wpg_log(FATAL, "failed to allocate xlog reader"); } @@ -1960,8 +1894,8 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) static void walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) { - HotStandbyFeedback hsFeedback; - bool needToAdvanceSlot = false; + HotStandbyFeedback hsFeedback; + bool needToAdvanceSlot = false; if (wp->config->syncSafekeepers) return; @@ -2095,22 +2029,25 @@ GetLogRepRestartLSN(WalProposer *wp) return lrRestartLsn; } -void SetNeonCurrentClusterSize(uint64 size) +void +SetNeonCurrentClusterSize(uint64 size) { pg_atomic_write_u64(&walprop_shared->currentClusterSize, size); } -uint64 GetNeonCurrentClusterSize(void) +uint64 +GetNeonCurrentClusterSize(void) { return pg_atomic_read_u64(&walprop_shared->currentClusterSize); } -uint64 GetNeonCurrentClusterSize(void); +uint64 GetNeonCurrentClusterSize(void); static const walproposer_api walprop_pg = { .get_shmem_state = walprop_pg_get_shmem_state, .start_streaming = walprop_pg_start_streaming, .get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr, + .update_donor = walprop_pg_update_donor, .get_current_timestamp = walprop_pg_get_current_timestamp, .conn_error_message = walprop_error_message, .conn_status = walprop_status, diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c new file mode 100644 index 000000000000..93dce9de84b0 --- /dev/null +++ b/pgxn/neon/walsender_hooks.c @@ -0,0 +1,172 @@ +/*------------------------------------------------------------------------- + * + * walsender_hooks.c + * + * Implements XLogReaderRoutine in terms of NeonWALReader. Allows for + * fetching WAL from safekeepers, which normal xlogreader can't do. 
+ * + *------------------------------------------------------------------------- + */ +#include "walsender_hooks.h" +#include "postgres.h" +#include "fmgr.h" +#include "access/xlogdefs.h" +#include "replication/walsender.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" +#include "miscadmin.h" +#include "utils/wait_event.h" +#include "utils/guc.h" +#include "postmaster/interrupt.h" + +#include "neon_walreader.h" +#include "walproposer.h" + +static NeonWALReader *wal_reader = NULL; +extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); +extern bool GetDonorShmem(XLogRecPtr *donor_lsn); + +static XLogRecPtr +NeonWALReadWaitForWAL(XLogRecPtr loc) +{ + while (!NeonWALReaderUpdateDonor(wal_reader)) + { + pg_usleep(1000); + CHECK_FOR_INTERRUPTS(); + } + + return WalSndWaitForWal(loc); +} + +static int +NeonWALPageRead( + XLogReaderState *xlogreader, + XLogRecPtr targetPagePtr, + int reqLen, + XLogRecPtr targetRecPtr, + char *readBuf) +{ + XLogRecPtr rem_lsn; + + /* Wait for flush pointer to advance past our request */ + XLogRecPtr flushptr = NeonWALReadWaitForWAL(targetPagePtr + reqLen); + int count; + + if (flushptr < targetPagePtr + reqLen) + return -1; + + /* Read at most XLOG_BLCKSZ bytes */ + if (targetPagePtr + XLOG_BLCKSZ <= flushptr) + count = XLOG_BLCKSZ; + else + count = flushptr - targetPagePtr; + + /* + * Sometimes walsender requests non-monotonic sequences of WAL. If that's + * the case, we have to reset streaming from remote at the correct + * position. For example, walsender may try to verify the segment header + * when trying to read in the middle of it. + */ + rem_lsn = NeonWALReaderGetRemLsn(wal_reader); + if (rem_lsn != InvalidXLogRecPtr && targetPagePtr != rem_lsn) + { + NeonWALReaderResetRemote(wal_reader); + } + + for (;;) + { + NeonWALReadResult res = NeonWALRead( + wal_reader, + readBuf, + targetPagePtr, + count, + walprop_pg_get_timeline_id()); + + if (res == NEON_WALREAD_SUCCESS) + { + /* + * Setting ws_tli is required by the XLogReaderRoutine, it is used + * for segment name generation in error reports. + * + * ReadPageInternal updates ws_segno after calling cb on its own + * and XLogReaderRoutine description doesn't require it, but + * WALRead sets, let's follow it. + */ + xlogreader->seg.ws_tli = NeonWALReaderGetSegment(wal_reader)->ws_tli; + xlogreader->seg.ws_segno = NeonWALReaderGetSegment(wal_reader)->ws_segno; + + /* + * ws_file doesn't exist in case of remote read, and isn't used by + * xlogreader except by WALRead on which we don't rely anyway. 
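NeonOnDemandXLogReaderRoutines() (further down in this new file) fills an XLogReaderRoutine with these callbacks so that a logical walsender can page in WAL through NeonWALReader rather than only from local pg_wal. A minimal sketch of how a caller might install them, assuming the stock XLogReaderAllocate() API; the actual hook wiring lives elsewhere in the extension and is not shown in this hunk:

/* Hypothetical wiring, not part of the patch. */
static XLogReaderState *
allocate_neon_walsender_reader(void)
{
	XLogReaderRoutine routine = {0};
	XLogReaderState *state;

	NeonOnDemandXLogReaderRoutines(&routine);
	state = XLogReaderAllocate(wal_segment_size, NULL, &routine, NULL);
	if (state == NULL)
		elog(ERROR, "out of memory while allocating XLogReader");
	return state;
}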
+ */ + return count; + } + if (res == NEON_WALREAD_ERROR) + { + elog(ERROR, "[walsender] Failed to read WAL (req_lsn=%X/%X, len=%d): %s", + LSN_FORMAT_ARGS(targetPagePtr), + reqLen, + NeonWALReaderErrMsg(wal_reader)); + return -1; + } + + /* + * Res is WOULDBLOCK, so we wait on the socket, recreating event set + * if necessary + */ + { + + pgsocket sock = NeonWALReaderSocket(wal_reader); + uint32_t reader_events = NeonWALReaderEvents(wal_reader); + long timeout_ms = 1000; + + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + WaitLatchOrSocket( + MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events, + sock, + timeout_ms, + WAIT_EVENT_WAL_SENDER_MAIN); + } + } +} + +static void +NeonWALReadSegmentOpen(XLogReaderState *xlogreader, XLogSegNo nextSegNo, TimeLineID *tli_p) +{ + neon_wal_segment_open(wal_reader, nextSegNo, tli_p); + xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file; +} + +static void +NeonWALReadSegmentClose(XLogReaderState *xlogreader) +{ + neon_wal_segment_close(wal_reader); + xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file; +} + +void +NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) +{ + if (!wal_reader) + { + XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn); + + if (epochStartLsn == 0) + { + elog(ERROR, "Unable to start walsender when propEpochStartLsn is 0!"); + } + wal_reader = NeonWALReaderAllocate(wal_segment_size, epochStartLsn, "[walsender] "); + } + xlr->page_read = NeonWALPageRead; + xlr->segment_open = NeonWALReadSegmentOpen; + xlr->segment_close = NeonWALReadSegmentClose; +} diff --git a/pgxn/neon/walsender_hooks.h b/pgxn/neon/walsender_hooks.h new file mode 100644 index 000000000000..2e3ce180f9bf --- /dev/null +++ b/pgxn/neon/walsender_hooks.h @@ -0,0 +1,7 @@ +#ifndef __WALSENDER_HOOKS_H__ +#define __WALSENDER_HOOKS_H__ + +struct XLogReaderRoutine; +void NeonOnDemandXLogReaderRoutines(struct XLogReaderRoutine *xlr); + +#endif diff --git a/poetry.lock b/poetry.lock index 6ed64d28fc02..ef9f572b1757 100644 --- a/poetry.lock +++ b/poetry.lock @@ -158,6 +158,28 @@ files = [ attrs = ">=16.0.0" pluggy = ">=0.4.0" +[[package]] +name = "annotated-types" +version = "0.6.0" +description = "Reusable constraint types to use with typing.Annotated" +optional = false +python-versions = ">=3.8" +files = [ + {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, + {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, +] + +[[package]] +name = "antlr4-python3-runtime" +version = "4.13.1" +description = "ANTLR 4.13.1 runtime for Python 3" +optional = false +python-versions = "*" +files = [ + {file = "antlr4-python3-runtime-4.13.1.tar.gz", hash = "sha256:3cd282f5ea7cfb841537fe01f143350fdb1c0b1ce7981443a2fa8513fddb6d1a"}, + {file = "antlr4_python3_runtime-4.13.1-py3-none-any.whl", hash = "sha256:78ec57aad12c97ac039ca27403ad61cb98aaec8a3f9bb8144f889aa0fa28b943"}, +] + [[package]] name = "anyio" version = "4.3.0" @@ -267,22 +289,23 @@ tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy" [[package]] name = "aws-sam-translator" -version = "1.48.0" +version = "1.88.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" optional = false -python-versions = ">=3.7, 
<=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "aws-sam-translator-1.48.0.tar.gz", hash = "sha256:7171037323dfa30f8f73e9bccb9210e4c384a585e087219a9518a5204f0a2c44"}, - {file = "aws_sam_translator-1.48.0-py2-none-any.whl", hash = "sha256:be18dfa3dfe7ab291d281667c5f73ac62dbe6bfe86df7d122e4258b906b736f0"}, - {file = "aws_sam_translator-1.48.0-py3-none-any.whl", hash = "sha256:ca4f8f9910d7713aeaba59346775bfb3198f6acb47c6704572f9bd3fc0fb5bf0"}, + {file = "aws_sam_translator-1.88.0-py3-none-any.whl", hash = "sha256:aa93d498d8de3fb3d485c316155b1628144b823bbc176099a20de06df666fcac"}, + {file = "aws_sam_translator-1.88.0.tar.gz", hash = "sha256:e77c65f3488566122277accd44a0f1ec018e37403e0d5fe25120d96e537e91a7"}, ] [package.dependencies] boto3 = ">=1.19.5,<2.dev0" -jsonschema = ">=3.2,<4.0" +jsonschema = ">=3.2,<5" +pydantic = ">=1.8,<3" +typing-extensions = ">=4.4" [package.extras] -dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.9.0,<2.10.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml (>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)"] +dev = ["black (==24.3.0)", "boto3 (>=1.23,<2)", "boto3-stubs[appconfig,serverlessrepo] (>=1.19.5,<2.dev0)", "coverage (>=5.3,<8)", "dateparser (>=1.1,<2.0)", "mypy (>=1.3.0,<1.4.0)", "parameterized (>=0.7,<1.0)", "pytest (>=6.2,<8)", "pytest-cov (>=2.10,<5)", "pytest-env (>=0.6,<1)", "pytest-rerunfailures (>=9.1,<12)", "pytest-xdist (>=2.5,<4)", "pyyaml (>=6.0,<7.0)", "requests (>=2.28,<3.0)", "ruamel.yaml (==0.17.21)", "ruff (>=0.1.0,<0.2.0)", "tenacity (>=8.0,<9.0)", "types-PyYAML (>=6.0,<7.0)", "types-jsonschema (>=3.2,<4.0)"] [[package]] name = "aws-xray-sdk" @@ -798,24 +821,26 @@ pycparser = "*" [[package]] name = "cfn-lint" -version = "0.61.3" +version = "0.87.1" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" optional = false -python-versions = ">=3.6, <=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, - {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, + {file = "cfn_lint-0.87.1-py3-none-any.whl", hash = "sha256:d450f450635fc223b6f66880ccac52a5fd1a52966fa1705f1ba52b88dfed3071"}, + {file = "cfn_lint-0.87.1.tar.gz", hash = "sha256:b3ce9d3e5e0eadcea5d584c8ccaa00bf2a990a36a64d7ffd8683bc60b7e4f06f"}, ] [package.dependencies] -aws-sam-translator = ">=1.47.0" +aws-sam-translator = ">=1.87.0" jschema-to-python = ">=1.2.3,<1.3.0" jsonpatch = "*" -jsonschema = ">=3.0,<4.0" +jsonschema = ">=3.0,<5" junit-xml = ">=1.9,<2.0" -networkx = ">=2.4,<3.0" +networkx = ">=2.4,<4" pyyaml = ">5.4" +regex = ">=2021.7.1" sarif-om = ">=1.0.4,<1.1.0" +sympy = ">=1.0.0" [[package]] name = "charset-normalizer" @@ -931,24 +956,6 @@ websocket-client = ">=0.32.0" ssh = ["paramiko (>=2.4.2)"] tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] -[[package]] -name = "ecdsa" -version = "0.18.0" -description = "ECDSA cryptographic signature library (pure python)" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = 
"ecdsa-0.18.0-py2.py3-none-any.whl", hash = "sha256:80600258e7ed2f16b9aa1d7c295bd70194109ad5a30fdee0eaeefef1d4c559dd"}, - {file = "ecdsa-0.18.0.tar.gz", hash = "sha256:190348041559e21b22a1d65cee485282ca11a6f81d503fddb84d5017e9ed1e49"}, -] - -[package.dependencies] -six = ">=1.9.0" - -[package.extras] -gmpy = ["gmpy"] -gmpy2 = ["gmpy2"] - [[package]] name = "exceptiongroup" version = "1.1.1" @@ -1001,18 +1008,17 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-cors" -version = "3.0.10" +version = "4.0.1" description = "A Flask extension adding a decorator for CORS support" optional = false python-versions = "*" files = [ - {file = "Flask-Cors-3.0.10.tar.gz", hash = "sha256:b60839393f3b84a0f3746f6cdca56c1ad7426aa738b70d6c61375857823181de"}, - {file = "Flask_Cors-3.0.10-py2.py3-none-any.whl", hash = "sha256:74efc975af1194fc7891ff5cd85b0f7478be4f7f59fe158102e91abb72bb4438"}, + {file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"}, + {file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"}, ] [package.dependencies] Flask = ">=0.9" -Six = "*" [[package]] name = "frozenlist" @@ -1243,13 +1249,13 @@ files = [ [[package]] name = "jinja2" -version = "3.1.3" +version = "3.1.4" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" files = [ - {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"}, - {file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"}, + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, ] [package.dependencies] @@ -1269,6 +1275,23 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "joserfc" +version = "0.9.0" +description = "The ultimate Python library for JOSE RFCs, including JWS, JWE, JWK, JWA, JWT" +optional = false +python-versions = ">=3.8" +files = [ + {file = "joserfc-0.9.0-py3-none-any.whl", hash = "sha256:4026bdbe2c196cd40574e916fa1e28874d99649412edaab0e373dec3077153fb"}, + {file = "joserfc-0.9.0.tar.gz", hash = "sha256:eebca7f587b1761ce43a98ffd5327f2b600b9aa5bb0a77b947687f503ad43bc0"}, +] + +[package.dependencies] +cryptography = "*" + +[package.extras] +drafts = ["pycryptodome"] + [[package]] name = "jschema-to-python" version = "1.2.3" @@ -1310,6 +1333,20 @@ files = [ [package.dependencies] jsonpointer = ">=1.9" +[[package]] +name = "jsonpath-ng" +version = "1.6.1" +description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." 
+optional = false +python-versions = "*" +files = [ + {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"}, + {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"}, +] + +[package.dependencies] +ply = "*" + [[package]] name = "jsonpickle" version = "2.2.0" @@ -1339,24 +1376,39 @@ files = [ [[package]] name = "jsonschema" -version = "3.2.0" +version = "4.17.3" description = "An implementation of JSON Schema validation for Python" optional = false -python-versions = "*" +python-versions = ">=3.7" files = [ - {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, - {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"}, + {file = "jsonschema-4.17.3-py3-none-any.whl", hash = "sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6"}, + {file = "jsonschema-4.17.3.tar.gz", hash = "sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d"}, ] [package.dependencies] attrs = ">=17.4.0" -pyrsistent = ">=0.14.0" -setuptools = "*" -six = ">=1.11.0" +pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" [package.extras] -format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] + +[[package]] +name = "jsonschema-spec" +version = "0.1.6" +description = "JSONSchema Spec with object-oriented paths" +optional = false +python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "jsonschema_spec-0.1.6-py3-none-any.whl", hash = "sha256:f2206d18c89d1824c1f775ba14ed039743b41a9167bd2c5bdb774b66b3ca0bbf"}, + {file = "jsonschema_spec-0.1.6.tar.gz", hash = "sha256:90215863b56e212086641956b20127ccbf6d8a3a38343dad01d6a74d19482f76"}, +] + +[package.dependencies] +jsonschema = ">=4.0.0,<4.18.0" +pathable = ">=0.4.1,<0.5.0" +PyYAML = ">=5.1" +requests = ">=2.31.0,<3.0.0" [[package]] name = "junit-xml" @@ -1372,6 +1424,52 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "lazy-object-proxy" +version = "1.10.0" +description = "A fast and thorough lazy object proxy." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab7004cf2e59f7c2e4345604a3e6ea0d92ac44e1c2375527d56492014e690c3"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc0d2fc424e54c70c4bc06787e4072c4f3b1aa2f897dfdc34ce1013cf3ceef05"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e2adb09778797da09d2b5ebdbceebf7dd32e2c96f79da9052b2e87b6ea495895"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b1f711e2c6dcd4edd372cf5dec5c5a30d23bba06ee012093267b3376c079ec83"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win32.whl", hash = "sha256:76a095cfe6045c7d0ca77db9934e8f7b71b14645f0094ffcd842349ada5c5fb9"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:b4f87d4ed9064b2628da63830986c3d2dca7501e6018347798313fcf028e2fd4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fec03caabbc6b59ea4a638bee5fce7117be8e99a4103d9d5ad77f15d6f81020c"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02c83f957782cbbe8136bee26416686a6ae998c7b6191711a04da776dc9e47d4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:009e6bb1f1935a62889ddc8541514b6a9e1fcf302667dcb049a0be5c8f613e56"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75fc59fc450050b1b3c203c35020bc41bd2695ed692a392924c6ce180c6f1dc9"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:782e2c9b2aab1708ffb07d4bf377d12901d7a1d99e5e410d648d892f8967ab1f"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win32.whl", hash = "sha256:edb45bb8278574710e68a6b021599a10ce730d156e5b254941754a9cc0b17d03"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:e271058822765ad5e3bca7f05f2ace0de58a3f4e62045a8c90a0dfd2f8ad8cc6"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e98c8af98d5707dcdecc9ab0863c0ea6e88545d42ca7c3feffb6b4d1e370c7ba"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:952c81d415b9b80ea261d2372d2a4a2332a3890c2b83e0535f263ddfe43f0d43"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80b39d3a151309efc8cc48675918891b865bdf742a8616a337cb0090791a0de9"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e221060b701e2aa2ea991542900dd13907a5c90fa80e199dbf5a03359019e7a3"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:92f09ff65ecff3108e56526f9e2481b8116c0b9e1425325e13245abfd79bdb1b"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-win32.whl", hash = "sha256:3ad54b9ddbe20ae9f7c1b29e52f123120772b06dbb18ec6be9101369d63a4074"}, + {file = 
"lazy_object_proxy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:127a789c75151db6af398b8972178afe6bda7d6f68730c057fbbc2e96b08d282"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4ed0518a14dd26092614412936920ad081a424bdcb54cc13349a8e2c6d106a"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ad9e6ed739285919aa9661a5bbed0aaf410aa60231373c5579c6b4801bd883c"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fc0a92c02fa1ca1e84fc60fa258458e5bf89d90a1ddaeb8ed9cc3147f417255"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0aefc7591920bbd360d57ea03c995cebc204b424524a5bd78406f6e1b8b2a5d8"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5faf03a7d8942bb4476e3b62fd0f4cf94eaf4618e304a19865abf89a35c0bbee"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win32.whl", hash = "sha256:e333e2324307a7b5d86adfa835bb500ee70bfcd1447384a822e96495796b0ca4"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:cb73507defd385b7705c599a94474b1d5222a508e502553ef94114a143ec6696"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366c32fe5355ef5fc8a232c5436f4cc66e9d3e8967c01fb2e6302fd6627e3d94"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2297f08f08a2bb0d32a4265e98a006643cd7233fb7983032bd61ac7a02956b3b"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18dd842b49456aaa9a7cf535b04ca4571a302ff72ed8740d06b5adcd41fe0757"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:217138197c170a2a74ca0e05bddcd5f1796c735c37d0eee33e43259b192aa424"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a3a87cf1e133e5b1994144c12ca4aa3d9698517fe1e2ca82977781b16955658"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win32.whl", hash = "sha256:30b339b2a743c5288405aa79a69e706a06e02958eab31859f7f3c04980853b70"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:a899b10e17743683b293a729d3a11f2f399e8a90c73b089e29f5d0fe3509f0dd"}, + {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"}, +] + [[package]] name = "markupsafe" version = "2.1.1" @@ -1423,64 +1521,80 @@ files = [ [[package]] name = "moto" -version = "4.1.2" +version = "5.0.6" description = "" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"}, - {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"}, + {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, + {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, ] [package.dependencies] +antlr4-python3-runtime = {version = "*", optional = true, markers = "extra == \"server\""} aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""} 
boto3 = ">=1.9.201" -botocore = ">=1.12.201" +botocore = ">=1.14.0" cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""} cryptography = ">=3.3.1" -docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} -ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} +docker = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""} flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} Jinja2 = ">=2.10.1" +joserfc = {version = ">=0.9.0", optional = true, markers = "extra == \"server\""} jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} -openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""} +jsonpath-ng = {version = "*", optional = true, markers = "extra == \"server\""} +openapi-spec-validator = {version = ">=0.5.0", optional = true, markers = "extra == \"server\""} +py-partiql-parser = {version = "0.5.4", optional = true, markers = "extra == \"server\""} pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" -python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""} PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" -responses = ">=0.13.0" +responses = ">=0.15.0" setuptools = {version = "*", optional = true, markers = "extra == \"server\""} -sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -apigatewayv2 = ["PyYAML (>=5.1)"] +all = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +apigateway = ["PyYAML (>=5.1)", "joserfc (>=0.9.0)", "openapi-spec-validator (>=0.5.0)"] +apigatewayv2 = ["PyYAML (>=5.1)", "openapi-spec-validator (>=0.5.0)"] appsync = ["graphql-core"] -awslambda = ["docker (>=2.5.1)"] -batch = ["docker (>=2.5.1)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -ds = ["sshpubkeys (>=3.1.0)"] -dynamodb = ["docker (>=2.5.1)"] -dynamodbstreams = ["docker (>=2.5.1)"] -ebs = ["sshpubkeys (>=3.1.0)"] -ec2 = ["sshpubkeys (>=3.1.0)"] -efs = ["sshpubkeys (>=3.1.0)"] -eks = ["sshpubkeys (>=3.1.0)"] +awslambda = 
["docker (>=3.0.0)"] +batch = ["docker (>=3.0.0)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +cognitoidp = ["joserfc (>=0.9.0)"] +dynamodb = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] +dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] -route53resolver = ["sshpubkeys (>=3.1.0)"] -s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +proxy = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +resourcegroupstaggingapi = ["PyYAML (>=5.1)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)"] +s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.5.4)"] +s3crc32c = ["PyYAML (>=5.1)", "crc32c", "py-partiql-parser (==0.5.4)"] +server = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] ssm = ["PyYAML (>=5.1)"] +stepfunctions = ["antlr4-python3-runtime", "jsonpath-ng"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] +[[package]] +name = "mpmath" +version = "1.3.0" +description = "Python library for arbitrary-precision floating-point arithmetic" +optional = false +python-versions = "*" +files = [ + {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, + {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, +] + +[package.extras] +develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] +docs = ["sphinx"] +gmpy = ["gmpy2 (>=2.1.0a4)"] +tests = ["pytest (>=4.6)"] + [[package]] name = "multidict" version = "6.0.4" @@ -1655,42 +1769,38 @@ test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] [[package]] name = "openapi-schema-validator" -version = "0.2.3" +version = "0.4.4" description = "OpenAPI schema validation for Python" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-schema-validator-0.2.3.tar.gz", hash = "sha256:2c64907728c3ef78e23711c8840a423f0b241588c9ed929855e4b2d1bb0cf5f2"}, - {file = "openapi_schema_validator-0.2.3-py3-none-any.whl", hash = "sha256:9bae709212a19222892cabcc60cafd903cbf4b220223f48583afa3c0e3cc6fc4"}, + {file = "openapi_schema_validator-0.4.4-py3-none-any.whl", hash = "sha256:79f37f38ef9fd5206b924ed7a6f382cea7b649b3b56383c47f1906082b7b9015"}, + {file = 
"openapi_schema_validator-0.4.4.tar.gz", hash = "sha256:c573e2be2c783abae56c5a1486ab716ca96e09d1c3eab56020d1dc680aa57bf8"}, ] [package.dependencies] -jsonschema = ">=3.0.0,<5.0.0" +jsonschema = ">=4.0.0,<4.18.0" +rfc3339-validator = "*" [package.extras] -isodate = ["isodate"] -rfc3339-validator = ["rfc3339-validator"] -strict-rfc3339 = ["strict-rfc3339"] +docs = ["sphinx (>=5.3.0,<6.0.0)", "sphinx-immaterial (>=0.11.0,<0.12.0)"] [[package]] name = "openapi-spec-validator" -version = "0.4.0" -description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3.0 spec validator" +version = "0.5.7" +description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3 spec validator" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-spec-validator-0.4.0.tar.gz", hash = "sha256:97f258850afc97b048f7c2653855e0f88fa66ac103c2be5077c7960aca2ad49a"}, - {file = "openapi_spec_validator-0.4.0-py3-none-any.whl", hash = "sha256:06900ac4d546a1df3642a779da0055be58869c598e3042a2fef067cfd99d04d0"}, + {file = "openapi_spec_validator-0.5.7-py3-none-any.whl", hash = "sha256:8712d2879db7692974ef89c47a3ebfc79436442921ec3a826ac0ce80cde8c549"}, + {file = "openapi_spec_validator-0.5.7.tar.gz", hash = "sha256:6c2d42180045a80fd6314de848b94310bdb0fa4949f4b099578b69f79d9fa5ac"}, ] [package.dependencies] -jsonschema = ">=3.2.0,<5.0.0" -openapi-schema-validator = ">=0.2.0,<0.3.0" -PyYAML = ">=5.1" -setuptools = "*" - -[package.extras] -requests = ["requests"] +jsonschema = ">=4.0.0,<4.18.0" +jsonschema-spec = ">=0.1.1,<0.2.0" +lazy-object-proxy = ">=1.7.1,<2.0.0" +openapi-schema-validator = ">=0.4.2,<0.5.0" [[package]] name = "packaging" @@ -1703,6 +1813,17 @@ files = [ {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, ] +[[package]] +name = "pathable" +version = "0.4.3" +description = "Object-oriented paths" +optional = false +python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "pathable-0.4.3-py3-none-any.whl", hash = "sha256:cdd7b1f9d7d5c8b8d3315dbf5a86b2596053ae845f056f57d97c0eefff84da14"}, + {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"}, +] + [[package]] name = "pbr" version = "5.9.0" @@ -1729,6 +1850,17 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "ply" +version = "3.11" +description = "Python Lex & Yacc" +optional = false +python-versions = "*" +files = [ + {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, + {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, +] + [[package]] name = "prometheus-client" version = "0.14.1" @@ -1841,16 +1973,19 @@ files = [ ] [[package]] -name = "pyasn1" -version = "0.4.8" -description = "ASN.1 types and codecs" +name = "py-partiql-parser" +version = "0.5.4" +description = "Pure Python PartiQL Parser" optional = false python-versions = "*" files = [ - {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, + {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"}, + {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"}, ] 
+[package.extras] +dev = ["black (==22.6.0)", "flake8", "mypy", "pytest"] + [[package]] name = "pycparser" version = "2.21" @@ -1862,6 +1997,116 @@ files = [ {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] +[[package]] +name = "pydantic" +version = "2.7.1" +description = "Data validation using Python type hints" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic-2.7.1-py3-none-any.whl", hash = "sha256:e029badca45266732a9a79898a15ae2e8b14840b1eabbb25844be28f0b33f3d5"}, + {file = "pydantic-2.7.1.tar.gz", hash = "sha256:e9dbb5eada8abe4d9ae5f46b9939aead650cd2b68f249bb3a8139dbe125803cc"}, +] + +[package.dependencies] +annotated-types = ">=0.4.0" +pydantic-core = "2.18.2" +typing-extensions = ">=4.6.1" + +[package.extras] +email = ["email-validator (>=2.0.0)"] + +[[package]] +name = "pydantic-core" +version = "2.18.2" +description = "Core functionality for Pydantic validation and serialization" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9e08e867b306f525802df7cd16c44ff5ebbe747ff0ca6cf3fde7f36c05a59a81"}, + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f0a21cbaa69900cbe1a2e7cad2aa74ac3cf21b10c3efb0fa0b80305274c0e8a2"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0680b1f1f11fda801397de52c36ce38ef1c1dc841a0927a94f226dea29c3ae3d"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:95b9d5e72481d3780ba3442eac863eae92ae43a5f3adb5b4d0a1de89d42bb250"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fcf5cd9c4b655ad666ca332b9a081112cd7a58a8b5a6ca7a3104bc950f2038"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b5155ff768083cb1d62f3e143b49a8a3432e6789a3abee8acd005c3c7af1c74"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553ef617b6836fc7e4df130bb851e32fe357ce36336d897fd6646d6058d980af"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89ed9eb7d616ef5714e5590e6cf7f23b02d0d539767d33561e3675d6f9e3857"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75f7e9488238e920ab6204399ded280dc4c307d034f3924cd7f90a38b1829563"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ef26c9e94a8c04a1b2924149a9cb081836913818e55681722d7f29af88fe7b38"}, + {file = "pydantic_core-2.18.2-cp310-none-win32.whl", hash = "sha256:182245ff6b0039e82b6bb585ed55a64d7c81c560715d1bad0cbad6dfa07b4027"}, + {file = "pydantic_core-2.18.2-cp310-none-win_amd64.whl", hash = "sha256:e23ec367a948b6d812301afc1b13f8094ab7b2c280af66ef450efc357d2ae543"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:219da3f096d50a157f33645a1cf31c0ad1fe829a92181dd1311022f986e5fbe3"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc1cfd88a64e012b74e94cd00bbe0f9c6df57049c97f02bb07d39e9c852e19a4"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b7133a6e6aeb8df37d6f413f7705a37ab4031597f64ab56384c94d98fa0e90"}, + {file = 
"pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:224c421235f6102e8737032483f43c1a8cfb1d2f45740c44166219599358c2cd"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b14d82cdb934e99dda6d9d60dc84a24379820176cc4a0d123f88df319ae9c150"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2728b01246a3bba6de144f9e3115b532ee44bd6cf39795194fb75491824a1413"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470b94480bb5ee929f5acba6995251ada5e059a5ef3e0dfc63cca287283ebfa6"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:997abc4df705d1295a42f95b4eec4950a37ad8ae46d913caeee117b6b198811c"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75250dbc5290e3f1a0f4618db35e51a165186f9034eff158f3d490b3fed9f8a0"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4456f2dca97c425231d7315737d45239b2b51a50dc2b6f0c2bb181fce6207664"}, + {file = "pydantic_core-2.18.2-cp311-none-win32.whl", hash = "sha256:269322dcc3d8bdb69f054681edff86276b2ff972447863cf34c8b860f5188e2e"}, + {file = "pydantic_core-2.18.2-cp311-none-win_amd64.whl", hash = "sha256:800d60565aec896f25bc3cfa56d2277d52d5182af08162f7954f938c06dc4ee3"}, + {file = "pydantic_core-2.18.2-cp311-none-win_arm64.whl", hash = "sha256:1404c69d6a676245199767ba4f633cce5f4ad4181f9d0ccb0577e1f66cf4c46d"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:fb2bd7be70c0fe4dfd32c951bc813d9fe6ebcbfdd15a07527796c8204bd36242"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6132dd3bd52838acddca05a72aafb6eab6536aa145e923bb50f45e78b7251043"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d904828195733c183d20a54230c0df0eb46ec746ea1a666730787353e87182"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9bd70772c720142be1020eac55f8143a34ec9f82d75a8e7a07852023e46617f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8ed04b3582771764538f7ee7001b02e1170223cf9b75dff0bc698fadb00cf3"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6dac87ddb34aaec85f873d737e9d06a3555a1cc1a8e0c44b7f8d5daeb89d86f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca4ae5a27ad7a4ee5170aebce1574b375de390bc01284f87b18d43a3984df72"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:886eec03591b7cf058467a70a87733b35f44707bd86cf64a615584fd72488b7c"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca7b0c1f1c983e064caa85f3792dd2fe3526b3505378874afa84baf662e12241"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b4356d3538c3649337df4074e81b85f0616b79731fe22dd11b99499b2ebbdf3"}, + {file = "pydantic_core-2.18.2-cp312-none-win32.whl", hash = "sha256:8b172601454f2d7701121bbec3425dd71efcb787a027edf49724c9cefc14c038"}, + {file = "pydantic_core-2.18.2-cp312-none-win_amd64.whl", hash = "sha256:b1bd7e47b1558ea872bd16c8502c414f9e90dcf12f1395129d7bb42a09a95438"}, + {file = 
"pydantic_core-2.18.2-cp312-none-win_arm64.whl", hash = "sha256:98758d627ff397e752bc339272c14c98199c613f922d4a384ddc07526c86a2ec"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9fdad8e35f278b2c3eb77cbdc5c0a49dada440657bf738d6905ce106dc1de439"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1d90c3265ae107f91a4f279f4d6f6f1d4907ac76c6868b27dc7fb33688cfb347"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390193c770399861d8df9670fb0d1874f330c79caaca4642332df7c682bf6b91"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:82d5d4d78e4448683cb467897fe24e2b74bb7b973a541ea1dcfec1d3cbce39fb"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4774f3184d2ef3e14e8693194f661dea5a4d6ca4e3dc8e39786d33a94865cefd"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4d938ec0adf5167cb335acb25a4ee69a8107e4984f8fbd2e897021d9e4ca21b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0e8b1be28239fc64a88a8189d1df7fad8be8c1ae47fcc33e43d4be15f99cc70"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:868649da93e5a3d5eacc2b5b3b9235c98ccdbfd443832f31e075f54419e1b96b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:78363590ef93d5d226ba21a90a03ea89a20738ee5b7da83d771d283fd8a56761"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:852e966fbd035a6468fc0a3496589b45e2208ec7ca95c26470a54daed82a0788"}, + {file = "pydantic_core-2.18.2-cp38-none-win32.whl", hash = "sha256:6a46e22a707e7ad4484ac9ee9f290f9d501df45954184e23fc29408dfad61350"}, + {file = "pydantic_core-2.18.2-cp38-none-win_amd64.whl", hash = "sha256:d91cb5ea8b11607cc757675051f61b3d93f15eca3cefb3e6c704a5d6e8440f4e"}, + {file = "pydantic_core-2.18.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ae0a8a797a5e56c053610fa7be147993fe50960fa43609ff2a9552b0e07013e8"}, + {file = "pydantic_core-2.18.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:042473b6280246b1dbf530559246f6842b56119c2926d1e52b631bdc46075f2a"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a388a77e629b9ec814c1b1e6b3b595fe521d2cdc625fcca26fbc2d44c816804"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25add29b8f3b233ae90ccef2d902d0ae0432eb0d45370fe315d1a5cf231004b"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f459a5ce8434614dfd39bbebf1041952ae01da6bed9855008cb33b875cb024c0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eff2de745698eb46eeb51193a9f41d67d834d50e424aef27df2fcdee1b153845"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8309f67285bdfe65c372ea3722b7a5642680f3dba538566340a9d36e920b5f0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f93a8a2e3938ff656a7c1bc57193b1319960ac015b6e87d76c76bf14fe0244b4"}, + {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:22057013c8c1e272eb8d0eebc796701167d8377441ec894a8fed1af64a0bf399"}, + {file = 
"pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfeecd1ac6cc1fb2692c3d5110781c965aabd4ec5d32799773ca7b1456ac636b"}, + {file = "pydantic_core-2.18.2-cp39-none-win32.whl", hash = "sha256:0d69b4c2f6bb3e130dba60d34c0845ba31b69babdd3f78f7c0c8fae5021a253e"}, + {file = "pydantic_core-2.18.2-cp39-none-win_amd64.whl", hash = "sha256:d9319e499827271b09b4e411905b24a426b8fb69464dfa1696258f53a3334641"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a1874c6dd4113308bd0eb568418e6114b252afe44319ead2b4081e9b9521fe75"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ccdd111c03bfd3666bd2472b674c6899550e09e9f298954cfc896ab92b5b0e6d"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e18609ceaa6eed63753037fc06ebb16041d17d28199ae5aba0052c51449650a9"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5c584d357c4e2baf0ff7baf44f4994be121e16a2c88918a5817331fc7599d7"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43f0f463cf89ace478de71a318b1b4f05ebc456a9b9300d027b4b57c1a2064fb"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e1b395e58b10b73b07b7cf740d728dd4ff9365ac46c18751bf8b3d8cca8f625a"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0098300eebb1c837271d3d1a2cd2911e7c11b396eac9661655ee524a7f10587b"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:36789b70d613fbac0a25bb07ab3d9dba4d2e38af609c020cf4d888d165ee0bf3"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f9a801e7c8f1ef8718da265bba008fa121243dfe37c1cea17840b0944dfd72c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:3a6515ebc6e69d85502b4951d89131ca4e036078ea35533bb76327f8424531ce"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20aca1e2298c56ececfd8ed159ae4dde2df0781988c97ef77d5c16ff4bd5b400"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:223ee893d77a310a0391dca6df00f70bbc2f36a71a895cecd9a0e762dc37b349"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2334ce8c673ee93a1d6a65bd90327588387ba073c17e61bf19b4fd97d688d63c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cbca948f2d14b09d20268cda7b0367723d79063f26c4ffc523af9042cad95592"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b3ef08e20ec49e02d5c6717a91bb5af9b20f1805583cb0adfe9ba2c6b505b5ae"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6fdc8627910eed0c01aed6a390a252fe3ea6d472ee70fdde56273f198938374"}, + {file = "pydantic_core-2.18.2.tar.gz", hash = "sha256:2e29d20810dfc3043ee13ac7d9e25105799817683348823f305ab3f349b9386e"}, +] + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" + [[package]] name = "pyjwt" version = "2.4.0" @@ -2116,28 +2361,6 @@ files = [ [package.dependencies] six = ">=1.5" -[[package]] -name = "python-jose" -version = "3.3.0" -description = "JOSE implementation in Python" -optional = false -python-versions = "*" -files = [ - {file = 
"python-jose-3.3.0.tar.gz", hash = "sha256:55779b5e6ad599c6336191246e95eb2293a9ddebd555f796a65f838f07e5d78a"}, - {file = "python_jose-3.3.0-py2.py3-none-any.whl", hash = "sha256:9b1376b023f8b298536eedd47ae1089bcdb848f1535ab30555cd92002d78923a"}, -] - -[package.dependencies] -cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"cryptography\""} -ecdsa = "!=0.15" -pyasn1 = "*" -rsa = "*" - -[package.extras] -cryptography = ["cryptography (>=3.4.0)"] -pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] -pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] - [[package]] name = "pywin32" version = "301" @@ -2182,7 +2405,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2217,6 +2439,94 @@ files = [ {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] +[[package]] +name = "regex" +version = "2024.4.28" +description = "Alternative regular expression module, to replace re." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:47af45b6153522733aa6e92543938e97a70ce0900649ba626cf5aad290b737b6"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99d6a550425cc51c656331af0e2b1651e90eaaa23fb4acde577cf15068e2e20f"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf29304a8011feb58913c382902fde3395957a47645bf848eea695839aa101b7"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92da587eee39a52c91aebea8b850e4e4f095fe5928d415cb7ed656b3460ae79a"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6277d426e2f31bdbacb377d17a7475e32b2d7d1f02faaecc48d8e370c6a3ff31"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28e1f28d07220c0f3da0e8fcd5a115bbb53f8b55cecf9bec0c946eb9a059a94c"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aaa179975a64790c1f2701ac562b5eeb733946eeb036b5bcca05c8d928a62f10"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6f435946b7bf7a1b438b4e6b149b947c837cb23c704e780c19ba3e6855dbbdd3"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:19d6c11bf35a6ad077eb23852827f91c804eeb71ecb85db4ee1386825b9dc4db"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:fdae0120cddc839eb8e3c15faa8ad541cc6d906d3eb24d82fb041cfe2807bc1e"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e672cf9caaf669053121f1766d659a8813bd547edef6e009205378faf45c67b8"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f57515750d07e14743db55d59759893fdb21d2668f39e549a7d6cad5d70f9fea"}, + {file = "regex-2024.4.28-cp310-cp310-win32.whl", hash = "sha256:a1409c4eccb6981c7baabc8888d3550df518add6e06fe74fa1d9312c1838652d"}, + {file = "regex-2024.4.28-cp310-cp310-win_amd64.whl", hash = "sha256:1f687a28640f763f23f8a9801fe9e1b37338bb1ca5d564ddd41619458f1f22d1"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:84077821c85f222362b72fdc44f7a3a13587a013a45cf14534df1cbbdc9a6796"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b45d4503de8f4f3dc02f1d28a9b039e5504a02cc18906cfe744c11def942e9eb"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:457c2cd5a646dd4ed536c92b535d73548fb8e216ebee602aa9f48e068fc393f3"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b51739ddfd013c6f657b55a508de8b9ea78b56d22b236052c3a85a675102dc6"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:459226445c7d7454981c4c0ce0ad1a72e1e751c3e417f305722bbcee6697e06a"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:670fa596984b08a4a769491cbdf22350431970d0112e03d7e4eeaecaafcd0fec"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00f4fe11c8a521b173e6324d862ee7ee3412bf7107570c9b564fe1119b56fb"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36f392dc7763fe7924575475736bddf9ab9f7a66b920932d0ea50c2ded2f5636"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:23a412b7b1a7063f81a742463f38821097b6a37ce1e5b89dd8e871d14dbfd86b"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f1d6e4b7b2ae3a6a9df53efbf199e4bfcff0959dbdb5fd9ced34d4407348e39a"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:499334ad139557de97cbc4347ee921c0e2b5e9c0f009859e74f3f77918339257"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:0940038bec2fe9e26b203d636c44d31dd8766abc1fe66262da6484bd82461ccf"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:66372c2a01782c5fe8e04bff4a2a0121a9897e19223d9eab30c54c50b2ebeb7f"}, + {file = "regex-2024.4.28-cp311-cp311-win32.whl", hash = "sha256:c77d10ec3c1cf328b2f501ca32583625987ea0f23a0c2a49b37a39ee5c4c4630"}, + {file = "regex-2024.4.28-cp311-cp311-win_amd64.whl", hash = "sha256:fc0916c4295c64d6890a46e02d4482bb5ccf33bf1a824c0eaa9e83b148291f90"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:08a1749f04fee2811c7617fdd46d2e46d09106fa8f475c884b65c01326eb15c5"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b8eb28995771c087a73338f695a08c9abfdf723d185e57b97f6175c5051ff1ae"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dd7ef715ccb8040954d44cfeff17e6b8e9f79c8019daae2fd30a8806ef5435c0"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb0315a2b26fde4005a7c401707c5352df274460f2f85b209cf6024271373013"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f2fc053228a6bd3a17a9b0a3f15c3ab3cf95727b00557e92e1cfe094b88cc662"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fe9739a686dc44733d52d6e4f7b9c77b285e49edf8570754b322bca6b85b4cc"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74fcf77d979364f9b69fcf8200849ca29a374973dc193a7317698aa37d8b01c"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:965fd0cf4694d76f6564896b422724ec7b959ef927a7cb187fc6b3f4e4f59833"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2fef0b38c34ae675fcbb1b5db760d40c3fc3612cfa186e9e50df5782cac02bcd"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc365ce25f6c7c5ed70e4bc674f9137f52b7dd6a125037f9132a7be52b8a252f"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ac69b394764bb857429b031d29d9604842bc4cbfd964d764b1af1868eeebc4f0"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:144a1fc54765f5c5c36d6d4b073299832aa1ec6a746a6452c3ee7b46b3d3b11d"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2630ca4e152c221072fd4a56d4622b5ada876f668ecd24d5ab62544ae6793ed6"}, + {file = 
"regex-2024.4.28-cp312-cp312-win32.whl", hash = "sha256:7f3502f03b4da52bbe8ba962621daa846f38489cae5c4a7b5d738f15f6443d17"}, + {file = "regex-2024.4.28-cp312-cp312-win_amd64.whl", hash = "sha256:0dd3f69098511e71880fb00f5815db9ed0ef62c05775395968299cb400aeab82"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:374f690e1dd0dbdcddea4a5c9bdd97632cf656c69113f7cd6a361f2a67221cb6"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f87ae6b96374db20f180eab083aafe419b194e96e4f282c40191e71980c666"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5dbc1bcc7413eebe5f18196e22804a3be1bfdfc7e2afd415e12c068624d48247"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f85151ec5a232335f1be022b09fbbe459042ea1951d8a48fef251223fc67eee1"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57ba112e5530530fd175ed550373eb263db4ca98b5f00694d73b18b9a02e7185"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224803b74aab56aa7be313f92a8d9911dcade37e5f167db62a738d0c85fdac4b"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a54a047b607fd2d2d52a05e6ad294602f1e0dec2291152b745870afc47c1397"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a2a512d623f1f2d01d881513af9fc6a7c46e5cfffb7dc50c38ce959f9246c94"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c06bf3f38f0707592898428636cbb75d0a846651b053a1cf748763e3063a6925"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1031a5e7b048ee371ab3653aad3030ecfad6ee9ecdc85f0242c57751a05b0ac4"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7a353ebfa7154c871a35caca7bfd8f9e18666829a1dc187115b80e35a29393e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7e76b9cfbf5ced1aca15a0e5b6f229344d9b3123439ffce552b11faab0114a02"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ce479ecc068bc2a74cb98dd8dba99e070d1b2f4a8371a7dfe631f85db70fe6e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7d77b6f63f806578c604dca209280e4c54f0fa9a8128bb8d2cc5fb6f99da4150"}, + {file = "regex-2024.4.28-cp38-cp38-win32.whl", hash = "sha256:d84308f097d7a513359757c69707ad339da799e53b7393819ec2ea36bc4beb58"}, + {file = "regex-2024.4.28-cp38-cp38-win_amd64.whl", hash = "sha256:2cc1b87bba1dd1a898e664a31012725e48af826bf3971e786c53e32e02adae6c"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7413167c507a768eafb5424413c5b2f515c606be5bb4ef8c5dee43925aa5718b"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:108e2dcf0b53a7c4ab8986842a8edcb8ab2e59919a74ff51c296772e8e74d0ae"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f1c5742c31ba7d72f2dedf7968998730664b45e38827637e0f04a2ac7de2f5f1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecc6148228c9ae25ce403eade13a0961de1cb016bdb35c6eafd8e7b87ad028b1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:b7d893c8cf0e2429b823ef1a1d360a25950ed11f0e2a9df2b5198821832e1947"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4290035b169578ffbbfa50d904d26bec16a94526071ebec3dadbebf67a26b25e"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a22ae1cfd82e4ffa2066eb3390777dc79468f866f0625261a93e44cdf6482b"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd24fd140b69f0b0bcc9165c397e9b2e89ecbeda83303abf2a072609f60239e2"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:39fb166d2196413bead229cd64a2ffd6ec78ebab83fff7d2701103cf9f4dfd26"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9301cc6db4d83d2c0719f7fcda37229691745168bf6ae849bea2e85fc769175d"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7c3d389e8d76a49923683123730c33e9553063d9041658f23897f0b396b2386f"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:99ef6289b62042500d581170d06e17f5353b111a15aa6b25b05b91c6886df8fc"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:b91d529b47798c016d4b4c1d06cc826ac40d196da54f0de3c519f5a297c5076a"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:43548ad74ea50456e1c68d3c67fff3de64c6edb85bcd511d1136f9b5376fc9d1"}, + {file = "regex-2024.4.28-cp39-cp39-win32.whl", hash = "sha256:05d9b6578a22db7dedb4df81451f360395828b04f4513980b6bd7a1412c679cc"}, + {file = "regex-2024.4.28-cp39-cp39-win_amd64.whl", hash = "sha256:3986217ec830c2109875be740531feb8ddafe0dfa49767cdcd072ed7e8927962"}, + {file = "regex-2024.4.28.tar.gz", hash = "sha256:83ab366777ea45d58f72593adf35d36ca911ea8bd838483c1823b883a121b0e4"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -2257,18 +2567,18 @@ urllib3 = ">=1.25.10" tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"] [[package]] -name = "rsa" -version = "4.9" -description = "Pure-Python RSA implementation" +name = "rfc3339-validator" +version = "0.1.4" +description = "A pure python RFC3339 validator" optional = false -python-versions = ">=3.6,<4" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ - {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, - {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, + {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, + {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, ] [package.dependencies] -pyasn1 = ">=0.1.3" +six = "*" [[package]] name = "ruff" @@ -2367,22 +2677,18 @@ files = [ ] [[package]] -name = "sshpubkeys" -version = "3.3.1" -description = "SSH public key parser" +name = "sympy" +version = "1.12" +description = "Computer algebra system (CAS) in Python" optional = false -python-versions = ">=3" +python-versions = ">=3.8" files = [ - {file = "sshpubkeys-3.3.1-py2.py3-none-any.whl", hash = "sha256:946f76b8fe86704b0e7c56a00d80294e39bc2305999844f079a217885060b1ac"}, - {file = 
"sshpubkeys-3.3.1.tar.gz", hash = "sha256:3020ed4f8c846849299370fbe98ff4157b0ccc1accec105e07cfa9ae4bb55064"}, + {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, + {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, ] [package.dependencies] -cryptography = ">=2.1.4" -ecdsa = ">=0.13" - -[package.extras] -dev = ["twine", "wheel", "yapf"] +mpmath = ">=0.19" [[package]] name = "toml" @@ -2612,13 +2918,13 @@ files = [ [[package]] name = "werkzeug" -version = "3.0.1" +version = "3.0.3" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.8" files = [ - {file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"}, - {file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"}, + {file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"}, + {file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"}, ] [package.dependencies] @@ -2653,16 +2959,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = 
"wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2900,4 +3196,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "b3452b50901123fd5f2c385ce8a0c1c492296393b8a7926a322b6df0ea3ac572" +content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 0e8d03906b2a..3002006aedac 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -40,6 +40,7 @@ hyper.workspace = true hyper1 = { package = "hyper", version = "1.2", features = ["server"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } +indexmap.workspace = true ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 3795e3b6089c..6a906b299b5b 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -13,7 +13,7 @@ use tokio_postgres::config::AuthKeys; use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::validate_password_and_exchange; +use crate::auth::{validate_password_and_exchange, AuthError}; use crate::cache::Cached; use crate::console::errors::GetAuthInfoError; use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; @@ -23,7 +23,7 @@ use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; -use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo}; +use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; use crate::{ auth::{self, ComputeUserInfoMaybeEndpoint}, @@ -280,6 +280,7 @@ async fn auth_quirks( client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, + endpoint_rate_limiter: Arc, ) -> auth::Result { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. @@ -305,6 +306,10 @@ async fn auth_quirks( if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr)); } + + if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { + return Err(AuthError::too_many_connections()); + } let cached_secret = match maybe_secret { Some(secret) => secret, None => api.get_role_secret(ctx, &info).await?, @@ -417,6 +422,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, + endpoint_rate_limiter: Arc, ) -> auth::Result> { use BackendType::*; @@ -428,8 +434,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { "performing authentication using the console" ); - let credentials = - auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?; + let credentials = auth_quirks( + ctx, + &*api, + user_info, + client, + allow_cleartext, + config, + endpoint_rate_limiter, + ) + .await?; BackendType::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. 
@@ -539,7 +553,7 @@ mod tests { }, context::RequestMonitoring, proxy::NeonOptions, - rate_limiter::RateBucketInfo, + rate_limiter::{EndpointRateLimiter, RateBucketInfo}, scram::ServerSecret, stream::{PqStream, Stream}, }; @@ -699,10 +713,20 @@ mod tests { _ => panic!("wrong message"), } }); - - let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, &CONFIG) - .await - .unwrap(); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + + let _creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + false, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); handle.await.unwrap(); } @@ -739,10 +763,20 @@ mod tests { frontend::password_message(b"my-secret-password", &mut write).unwrap(); client.write_all(&write).await.unwrap(); }); - - let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG) - .await - .unwrap(); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + + let _creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + true, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); handle.await.unwrap(); } @@ -780,9 +814,20 @@ mod tests { client.write_all(&write).await.unwrap(); }); - let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG) - .await - .unwrap(); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + + let creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + true, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); assert_eq!(creds.info.endpoint, "my-endpoint"); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 0956aae6c0a5..be7d961b8c5c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -27,6 +27,7 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; use proxy::redis::notifications; +use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; @@ -143,6 +144,9 @@ struct ProxyCliArgs { /// Can be given multiple times for different bucket sizes. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] endpoint_rps_limit: Vec, + /// Wake compute rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + wake_compute_limit: Vec, /// Whether the auth rate limiter actually takes effect (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] auth_rate_limit_enabled: bool, @@ -153,7 +157,7 @@ struct ProxyCliArgs { #[clap(long, default_value_t = 64)] auth_rate_limit_ip_subnet: u8, /// Redis rate limiter max number of requests per second. 
- #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] redis_rps_limit: Vec, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] @@ -243,6 +247,12 @@ struct SqlOverHttpArgs { /// increase memory used by the pool #[clap(long, default_value_t = 128)] sql_over_http_pool_shards: usize, + + #[clap(long, default_value_t = 10000)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 64)] + sql_over_http_cancel_set_shards: usize, } #[tokio::main] @@ -358,6 +368,10 @@ async fn main() -> anyhow::Result<()> { proxy::metrics::CancellationSource::FromClient, )); + let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); + RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); + // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); @@ -366,6 +380,7 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), cancellation_handler.clone(), + endpoint_rate_limiter.clone(), )); // TODO: rename the argument to something like serverless. @@ -380,6 +395,7 @@ async fn main() -> anyhow::Result<()> { serverless_listener, cancellation_token.clone(), cancellation_handler.clone(), + endpoint_rate_limiter.clone(), )); } @@ -552,11 +568,16 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client()); - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); - let api = - console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter); + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit)); + let api = console::provider::neon::Api::new( + endpoint, + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); let api = console::provider::ConsoleBackend::Console(api); auth::BackendType::Console(MaybeOwned::Owned(api), ()) } @@ -599,6 +620,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { opt_in: args.sql_over_http.sql_over_http_pool_opt_in, max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, }; let authentication_config = AuthenticationConfig { scram_protocol_timeout: args.scram_protocol_timeout, diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 23266ac4effe..4433b3c1c2dd 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,7 +1,7 @@ use crate::{ auth::parse_endpoint_param, cancellation::CancelClosure, - console::{errors::WakeComputeError, messages::MetricsAuxInfo}, + console::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, metrics::{Metrics, NumDbConnectionsGuard}, @@ -34,6 +34,9 @@ pub enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] 
WakeComputeError(#[from] WakeComputeError), + + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } impl UserFacingError for ConnectionError { @@ -57,6 +60,9 @@ impl UserFacingError for ConnectionError { None => err.to_string(), }, WakeComputeError(err) => err.to_string_client(), + TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } _ => COULD_NOT_CONNECT.to_owned(), } } @@ -72,6 +78,7 @@ impl ReportableError for ConnectionError { ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), + ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), } } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 0c8e284d0b3a..b7ab2c00f90d 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -2,7 +2,7 @@ use crate::{ auth::{self, backend::AuthRateLimiter}, console::locks::ApiLocks, rate_limiter::RateBucketInfo, - serverless::GlobalConnPoolOptions, + serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, Host, }; use anyhow::{bail, ensure, Context, Ok}; @@ -56,6 +56,8 @@ pub struct TlsConfig { pub struct HttpConfig { pub request_timeout: tokio::time::Duration, pub pool_options: GlobalConnPoolOptions, + pub cancel_set: CancelSet, + pub client_conn_threshold: u64, } pub struct AuthenticationConfig { @@ -536,9 +538,9 @@ pub struct RetryConfig { impl RetryConfig { /// Default options for RetryConfig. - /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. + /// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s. pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = - "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; + "num_retries=5,base_retry_wait_duration=200ms,retry_wait_exponent_base=2"; /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. /// Cplane has timeout of 60s on each request. 8m7s in total. pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str = @@ -592,7 +594,7 @@ impl ConcurrencyLockOptions { pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; /// Default options for [`crate::console::provider::ApiLocks`]. pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str = - "shards=64,permits=50,epoch=10m,timeout=500ms"; + "shards=64,permits=10,epoch=10m,timeout=10ms"; // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index dfda29e0b141..3b996cdbd11f 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -12,6 +12,7 @@ use crate::{ compute, config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, + error::ReportableError, intern::ProjectIdInt, metrics::ApiLockMetrics, scram, EndpointCacheKey, @@ -30,6 +31,8 @@ pub mod errors { }; use thiserror::Error; + use super::ApiLockError; + /// A go-to error message which doesn't leak any detail. const REQUEST_FAILED: &str = "Console request failed"; @@ -76,7 +79,7 @@ pub mod errors { } http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => { // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. 
- format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support") + format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.") } _ => REQUEST_FAILED.to_owned(), }, @@ -211,8 +214,8 @@ pub mod errors { #[error("Too many connections attempts")] TooManyConnections, - #[error("Timeout waiting to acquire wake compute lock")] - TimeoutError, + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } // This allows more useful interactions than `#[from]`. @@ -222,17 +225,6 @@ pub mod errors { } } - impl From for WakeComputeError { - fn from(_: tokio::sync::AcquireError) -> Self { - WakeComputeError::TimeoutError - } - } - impl From for WakeComputeError { - fn from(_: tokio::time::error::Elapsed) -> Self { - WakeComputeError::TimeoutError - } - } - impl UserFacingError for WakeComputeError { fn to_string_client(&self) -> String { use WakeComputeError::*; @@ -245,7 +237,9 @@ pub mod errors { TooManyConnections => self.to_string(), - TimeoutError => "timeout while acquiring the compute resource lock".to_owned(), + TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } } } } @@ -256,7 +250,7 @@ pub mod errors { WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, WakeComputeError::ApiError(e) => e.get_error_kind(), WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit, - WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit, + WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(), } } } @@ -456,6 +450,23 @@ pub struct ApiLocks { metrics: &'static ApiLockMetrics, } +#[derive(Debug, thiserror::Error)] +pub enum ApiLockError { + #[error("lock was closed")] + AcquireError(#[from] tokio::sync::AcquireError), + #[error("permit could not be acquired")] + TimeoutError(#[from] tokio::time::error::Elapsed), +} + +impl ReportableError for ApiLockError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ApiLockError::AcquireError(_) => crate::error::ErrorKind::Service, + ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit, + } + } +} + impl ApiLocks { pub fn new( name: &'static str, @@ -475,7 +486,7 @@ impl ApiLocks { }) } - pub async fn get_permit(&self, key: &K) -> Result { + pub async fn get_permit(&self, key: &K) -> Result { if self.permits == 0 { return Ok(WakeComputePermit { permit: None }); } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index ec66641d01df..7728d2cafa8c 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -26,7 +26,7 @@ pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub locks: &'static ApiLocks, - pub endpoint_rate_limiter: Arc, + pub wake_compute_endpoint_rate_limiter: Arc, jwt: String, } @@ -36,7 +36,7 @@ impl Api { endpoint: http::Endpoint, caches: &'static ApiCaches, locks: &'static ApiLocks, - endpoint_rate_limiter: Arc, + wake_compute_endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, @@ -46,7 +46,7 @@ impl Api { endpoint, caches, locks, - endpoint_rate_limiter, + wake_compute_endpoint_rate_limiter, jwt, } } @@ -283,7 +283,7 @@ impl super::Api for Api { // check rate limit if !self - .endpoint_rate_limiter + 
.wake_compute_endpoint_rate_limiter .check(user_info.endpoint.normalize().into(), 1) { return Err(WakeComputeError::TooManyConnections); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index e4e095d77d7a..5824b70df91a 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -19,6 +19,7 @@ use crate::{ metrics::{Metrics, NumClientConnectionsGuard}, protocol2::read_proxy_protocol, proxy::handshake::{handshake, HandshakeData}, + rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, EndpointCacheKey, }; @@ -61,6 +62,7 @@ pub async fn task_main( listener: tokio::net::TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("proxy has shut down"); @@ -86,6 +88,7 @@ pub async fn task_main( let cancellation_handler = Arc::clone(&cancellation_handler); tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); connections.spawn(async move { let (socket, peer_addr) = match read_proxy_protocol(socket).await{ @@ -123,6 +126,7 @@ pub async fn task_main( cancellation_handler, socket, ClientMode::Tcp, + endpoint_rate_limiter2, conn_gauge, ) .instrument(span.clone()) @@ -234,6 +238,7 @@ pub async fn handle_client( cancellation_handler: Arc, stream: S, mode: ClientMode, + endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { info!( @@ -243,7 +248,6 @@ pub async fn handle_client( let metrics = &Metrics::get().proxy; let proto = ctx.protocol; - // let _client_gauge = metrics.client_connections.guard(proto); let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); @@ -286,6 +290,7 @@ pub async fn handle_client( &mut stream, mode.allow_cleartext(), &config.authentication_config, + endpoint_rate_limiter, ) .await { diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 36a05ba190e9..8dec1f1137a2 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -86,6 +86,8 @@ impl ShouldRetry for compute::ConnectionError { match self { compute::ConnectionError::Postgres(err) => err.should_retry_database_address(), compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(), + // the cache entry was not checked for validity + compute::ConnectionError::TooManyConnectionAttempts(_) => false, _ => true, } } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 3d9e94dd72dc..94b03e1ccc10 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -119,7 +119,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { WakeupFailureKind::ApiConsoleOtherError } WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, - WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError, + WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError, }; Metrics::get() .proxy diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 5ba2c36436e2..b8c949069638 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -128,12 +128,18 @@ impl std::str::FromStr for RateBucketInfo { } impl RateBucketInfo { - pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [ + pub const DEFAULT_SET: [Self; 3] = [ Self::new(300, Duration::from_secs(1)), Self::new(200, Duration::from_secs(60)), Self::new(100, Duration::from_secs(600)), ]; + pub const 
DEFAULT_ENDPOINT_SET: [Self; 3] = [ + Self::new(500, Duration::from_secs(1)), + Self::new(300, Duration::from_secs(60)), + Self::new(200, Duration::from_secs(600)), + ]; + pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -266,7 +272,7 @@ mod tests { #[test] fn default_rate_buckets() { - let mut defaults = RateBucketInfo::DEFAULT_ENDPOINT_SET; + let mut defaults = RateBucketInfo::DEFAULT_SET; RateBucketInfo::validate(&mut defaults[..]).unwrap(); } @@ -333,11 +339,8 @@ mod tests { let rand = rand::rngs::StdRng::from_seed([1; 32]); let hasher = BuildHasherDefault::::default(); - let limiter = BucketRateLimiter::new_with_rand_and_hasher( - &RateBucketInfo::DEFAULT_ENDPOINT_SET, - rand, - hasher, - ); + let limiter = + BucketRateLimiter::new_with_rand_and_hasher(&RateBucketInfo::DEFAULT_SET, rand, hasher); for i in 0..1_000_000 { limiter.check(i, 1); } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 1a0d1f7b0e60..f634ab4e982e 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -3,6 +3,7 @@ //! Handles both SQL over HTTP and SQL over Websockets. mod backend; +pub mod cancel_set; mod conn_pool; mod http_util; mod json; @@ -35,6 +36,7 @@ use crate::context::RequestMonitoring; use crate::metrics::Metrics; use crate::protocol2::read_proxy_protocol; use crate::proxy::run_until_cancelled; +use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; @@ -53,6 +55,7 @@ pub async fn task_main( ws_listener: TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("websocket server has shut down"); @@ -81,6 +84,7 @@ pub async fn task_main( let backend = Arc::new(PoolingBackend { pool: Arc::clone(&conn_pool), config, + endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), }); let tls_config = match config.tls_config.as_ref() { @@ -109,20 +113,38 @@ pub async fn task_main( let conn_id = uuid::Uuid::new_v4(); let http_conn_span = tracing::info_span!("http_conn", ?conn_id); - connections.spawn( - connection_handler( - config, - backend.clone(), - connections.clone(), - cancellation_handler.clone(), - cancellation_token.clone(), - server.clone(), - tls_acceptor.clone(), - conn, - peer_addr, - ) - .instrument(http_conn_span), - ); + let n_connections = Metrics::get() + .proxy + .client_connections + .sample(crate::metrics::Protocol::Http); + tracing::trace!(?n_connections, threshold = ?config.http_config.client_conn_threshold, "check"); + if n_connections > config.http_config.client_conn_threshold { + tracing::trace!("attempting to cancel a random connection"); + if let Some(token) = config.http_config.cancel_set.take() { + tracing::debug!("cancelling a random connection"); + token.cancel() + } + } + + let conn_token = cancellation_token.child_token(); + let conn = connection_handler( + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + conn_token.clone(), + server.clone(), + tls_acceptor.clone(), + conn, + peer_addr, + ) + .instrument(http_conn_span); + + connections.spawn(async move { + let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token); + conn.await + }); } connections.wait().await; @@ -144,6 +166,7 @@ async fn connection_handler( backend: Arc, connections: TaskTracker, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, server: Builder, tls_acceptor: TlsAcceptor, @@ -227,6 +250,7 @@ async fn connection_handler( session_id, peer_addr, http_request_token, + endpoint_rate_limiter.clone(), ) .in_current_span() .map_ok_or_else(api_error_into_response, |r| r), @@ -243,6 +267,7 @@ async fn connection_handler( // On cancellation, trigger the HTTP connection handler to shut down. let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { Either::Left((_cancelled, mut conn)) => { + tracing::debug!(%peer_addr, "cancelling connection"); conn.as_mut().graceful_shutdown(); conn.await } @@ -266,6 +291,7 @@ async fn request_handler( peer_addr: IpAddr, // used to cancel in-flight HTTP requests. 
not used to cancel websockets http_cancellation_token: CancellationToken, + endpoint_rate_limiter: Arc, ) -> Result>, ApiError> { let host = request .headers() @@ -291,9 +317,15 @@ async fn request_handler( ws_connections.spawn( async move { - if let Err(e) = - websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host) - .await + if let Err(e) = websocket::serve_websocket( + config, + ctx, + websocket, + cancellation_handler, + endpoint_rate_limiter, + host, + ) + .await { error!("error in websocket connection: {e:#}"); } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 963913a260e1..6b79c123163a 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -10,11 +10,13 @@ use crate::{ console::{ errors::{GetAuthInfoError, WakeComputeError}, locks::ApiLocks, + provider::ApiLockError, CachedNodeInfo, }, context::RequestMonitoring, error::{ErrorKind, ReportableError, UserFacingError}, proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, + rate_limiter::EndpointRateLimiter, Host, }; @@ -23,6 +25,7 @@ use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; pub struct PoolingBackend { pub pool: Arc>, pub config: &'static ProxyConfig, + pub endpoint_rate_limiter: Arc, } impl PoolingBackend { @@ -38,6 +41,12 @@ impl PoolingBackend { if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr)); } + if !self + .endpoint_rate_limiter + .check(conn_info.user_info.endpoint.clone().into(), 1) + { + return Err(AuthError::too_many_connections()); + } let cached_secret = match maybe_secret { Some(secret) => secret, None => backend.get_role_secret(ctx).await?, @@ -131,6 +140,8 @@ pub enum HttpConnError { AuthError(#[from] AuthError), #[error("wake_compute returned error")] WakeCompute(#[from] WakeComputeError), + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } impl ReportableError for HttpConnError { @@ -141,6 +152,7 @@ impl ReportableError for HttpConnError { HttpConnError::GetAuthInfo(a) => a.get_error_kind(), HttpConnError::AuthError(a) => a.get_error_kind(), HttpConnError::WakeCompute(w) => w.get_error_kind(), + HttpConnError::TooManyConnectionAttempts(w) => w.get_error_kind(), } } } @@ -153,6 +165,9 @@ impl UserFacingError for HttpConnError { HttpConnError::GetAuthInfo(c) => c.to_string_client(), HttpConnError::AuthError(c) => c.to_string_client(), HttpConnError::WakeCompute(c) => c.to_string_client(), + HttpConnError::TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } } } } @@ -165,6 +180,15 @@ impl ShouldRetry for HttpConnError { HttpConnError::GetAuthInfo(_) => false, HttpConnError::AuthError(_) => false, HttpConnError::WakeCompute(_) => false, + HttpConnError::TooManyConnectionAttempts(_) => false, + } + } + fn should_retry_database_address(&self) -> bool { + match self { + HttpConnError::ConnectionError(e) => e.should_retry_database_address(), + // we never checked cache validity + HttpConnError::TooManyConnectionAttempts(_) => false, + _ => true, } } } diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs new file mode 100644 index 000000000000..390df7f4f7af --- /dev/null +++ b/proxy/src/serverless/cancel_set.rs @@ -0,0 +1,102 @@ +//! 
A set for cancelling random http connections + +use std::{ + hash::{BuildHasher, BuildHasherDefault}, + num::NonZeroUsize, + time::Duration, +}; + +use indexmap::IndexMap; +use parking_lot::Mutex; +use rand::{thread_rng, Rng}; +use rustc_hash::FxHasher; +use tokio::time::Instant; +use tokio_util::sync::CancellationToken; +use uuid::Uuid; + +type Hasher = BuildHasherDefault; + +pub struct CancelSet { + shards: Box<[Mutex]>, + // keyed by random uuid, fxhasher is fine + hasher: Hasher, +} + +pub struct CancelShard { + tokens: IndexMap, +} + +impl CancelSet { + pub fn new(shards: usize) -> Self { + CancelSet { + shards: (0..shards) + .map(|_| { + Mutex::new(CancelShard { + tokens: IndexMap::with_hasher(Hasher::default()), + }) + }) + .collect(), + hasher: Hasher::default(), + } + } + + pub fn take(&self) -> Option { + for _ in 0..4 { + if let Some(token) = self.take_raw(thread_rng().gen()) { + return Some(token); + } + tracing::trace!("failed to get cancel token"); + } + None + } + + pub fn take_raw(&self, rng: usize) -> Option { + NonZeroUsize::new(self.shards.len()) + .and_then(|len| self.shards[rng % len].lock().take(rng / len)) + } + + pub fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> { + let shard = NonZeroUsize::new(self.shards.len()).map(|len| { + let hash = self.hasher.hash_one(id) as usize; + let shard = &self.shards[hash % len]; + shard.lock().insert(id, token); + shard + }); + CancelGuard { shard, id } + } +} + +impl CancelShard { + fn take(&mut self, rng: usize) -> Option { + NonZeroUsize::new(self.tokens.len()).and_then(|len| { + // 10 second grace period so we don't cancel new connections + if self.tokens.get_index(rng % len)?.1 .0.elapsed() < Duration::from_secs(10) { + return None; + } + + let (_key, (_insert, token)) = self.tokens.swap_remove_index(rng % len)?; + Some(token) + }) + } + + fn remove(&mut self, id: uuid::Uuid) { + self.tokens.swap_remove(&id); + } + + fn insert(&mut self, id: uuid::Uuid, token: CancellationToken) { + self.tokens.insert(id, (Instant::now(), token)); + } +} + +pub struct CancelGuard<'a> { + shard: Option<&'a Mutex>, + id: Uuid, +} + +impl Drop for CancelGuard<'_> { + fn drop(&mut self) { + if let Some(shard) = self.shard { + shard.lock().remove(self.id); + } + } +} diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 798e48850906..5fa253acf86d 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -716,7 +716,7 @@ impl Drop for Client { mod tests { use std::{mem, sync::atomic::AtomicBool}; - use crate::{BranchId, EndpointId, ProjectId}; + use crate::{serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId}; use super::*; @@ -767,6 +767,8 @@ mod tests { max_total_conns: 3, }, request_timeout: Duration::from_secs(1), + cancel_set: CancelSet::new(0), + client_conn_threshold: u64::MAX, })); let pool = GlobalConnPool::new(config); let conn_info = ConnInfo { diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index e856053a7e8c..5376bddfd314 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -424,8 +424,8 @@ pub enum SqlOverHttpCancel { impl ReportableError for SqlOverHttpCancel { fn get_error_kind(&self) -> ErrorKind { match self { - SqlOverHttpCancel::Postgres => ErrorKind::RateLimit, - SqlOverHttpCancel::Connect => ErrorKind::ServiceRateLimit, + SqlOverHttpCancel::Postgres => ErrorKind::ClientDisconnect, + SqlOverHttpCancel::Connect => 
ErrorKind::ClientDisconnect, } } } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index b6cd85af7316..649bec2c7c69 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -5,6 +5,7 @@ use crate::{ error::{io_error, ReportableError}, metrics::Metrics, proxy::{handle_client, ClientMode}, + rate_limiter::EndpointRateLimiter, }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream}; @@ -134,6 +135,7 @@ pub async fn serve_websocket( mut ctx: RequestMonitoring, websocket: HyperWebsocket, cancellation_handler: Arc, + endpoint_rate_limiter: Arc, hostname: Option, ) -> anyhow::Result<()> { let websocket = websocket.await?; @@ -148,6 +150,7 @@ pub async fn serve_websocket( cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, + endpoint_rate_limiter, conn_gauge, ) .await; diff --git a/pyproject.toml b/pyproject.toml index aadcf26818b0..ac7f9b061c20 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,17 +14,17 @@ requests = "^2.31.0" pytest-xdist = "^3.3.1" asyncpg = "^0.29.0" aiopg = "^1.4.0" -Jinja2 = "^3.1.3" +Jinja2 = "^3.1.4" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.10" boto3 = "^1.34.11" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} -moto = {extras = ["server"], version = "^4.1.2"} +moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" -Werkzeug = "^3.0.1" +Werkzeug = "^3.0.3" pytest-order = "^1.1.0" allure-pytest = "^2.13.2" pytest-asyncio = "^0.21.0" diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 7c0f699958d2..dd64a0a98f89 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -13,7 +13,7 @@ use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; use pageserver::tenant::remote_timeline_client::parse_remote_index_path; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use remote_storage::RemotePath; @@ -110,7 +110,7 @@ pub(crate) fn branch_cleanup_and_check_errors( for (layer, metadata) in index_part.layer_metadata { if metadata.file_size == 0 { result.errors.push(format!( - "index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(), + "index_part.json contains a layer {} that has 0 size in its layer metadata", layer, )) } @@ -121,7 +121,7 @@ pub(crate) fn branch_cleanup_and_check_errors( // layer we think is missing. result.errors.push(format!( "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", - layer.file_name(), + layer, metadata.generation.get_suffix(), metadata.shard )) @@ -170,8 +170,7 @@ pub(crate) struct LayerRef { /// the tenant to query whether an object exists. 
#[derive(Default)] pub(crate) struct TenantObjectListing { - shard_timelines: - HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>, + shard_timelines: HashMap<(ShardIndex, TimelineId), HashMap<(LayerName, Generation), LayerRef>>, } impl TenantObjectListing { @@ -180,7 +179,7 @@ impl TenantObjectListing { pub(crate) fn push( &mut self, ttid: TenantShardTimelineId, - layers: HashSet<(LayerFileName, Generation)>, + layers: HashSet<(LayerName, Generation)>, ) { let shard_index = ShardIndex::new( ttid.tenant_shard_id.shard_number, @@ -208,7 +207,7 @@ impl TenantObjectListing { pub(crate) fn check_ref( &mut self, timeline_id: TimelineId, - layer_file: &LayerFileName, + layer_file: &LayerName, metadata: &IndexLayerMetadata, ) -> bool { let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else { @@ -224,7 +223,7 @@ impl TenantObjectListing { true } - pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> { + pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerName, Generation)> { let mut result = Vec::new(); for ((shard_index, timeline_id), layers) in &self.shard_timelines { for ((layer_file, generation), layer_ref) in layers { @@ -247,25 +246,25 @@ pub(crate) struct S3TimelineBlobData { #[derive(Debug)] pub(crate) enum BlobDataParseResult { Parsed { - index_part: IndexPart, + index_part: Box, index_part_generation: Generation, - s3_layers: HashSet<(LayerFileName, Generation)>, + s3_layers: HashSet<(LayerName, Generation)>, }, /// The remains of a deleted Timeline (i.e. an initdb archive only) Relic, Incorrect(Vec), } -fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), String> { +fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? 
Some((layer_filename, gen)) if gen.len() == 8 => { - let layer = layer_filename.parse::()?; + let layer = layer_filename.parse::()?; let gen = Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?; Ok((layer, gen)) } - _ => Ok((name.parse::()?, Generation::none())), + _ => Ok((name.parse::()?, Generation::none())), } } @@ -369,7 +368,7 @@ pub(crate) async fn list_timeline_blobs( Ok(index_part) => { return Ok(S3TimelineBlobData { blob_data: BlobDataParseResult::Parsed { - index_part, + index_part: Box::new(index_part), index_part_generation, s3_layers, }, diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index e976e66748d0..7966fb6a886b 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -312,7 +312,10 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie let sleep_impl: Arc = Arc::new(TokioSleep::new()); let mut builder = Config::builder() - .behavior_version(BehaviorVersion::v2023_11_09()) + .behavior_version( + #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ + BehaviorVersion::v2023_11_09(), + ) .region(bucket_region) .retry_config(RetryConfig::adaptive().with_max_attempts(3)) .sleep_impl(SharedAsyncSleep::from(sleep_impl)) diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs index 4eccad381b26..a24a1e92aea6 100644 --- a/s3_scrubber/src/tenant_snapshot.rs +++ b/s3_scrubber/src/tenant_snapshot.rs @@ -12,7 +12,7 @@ use aws_sdk_s3::Client; use camino::Utf8PathBuf; use futures::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; use utils::generation::Generation; @@ -48,16 +48,16 @@ impl SnapshotDownloader { async fn download_layer( &self, ttid: TenantShardTimelineId, - layer_name: LayerFileName, + layer_name: LayerName, layer_metadata: IndexLayerMetadata, - ) -> anyhow::Result<(LayerFileName, IndexLayerMetadata)> { + ) -> anyhow::Result<(LayerName, IndexLayerMetadata)> { // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. 
They use // different layer names (remote-style has the generation suffix) let local_path = self.output_path.join(format!( "{}/timelines/{}/{}{}", ttid.tenant_shard_id, ttid.timeline_id, - layer_name.file_name(), + layer_name, layer_metadata.generation.get_suffix() )); @@ -76,7 +76,7 @@ impl SnapshotDownloader { let remote_layer_path = format!( "{}{}{}", timeline_root.prefix_in_bucket, - layer_name.file_name(), + layer_name, layer_metadata.generation.get_suffix() ); @@ -110,7 +110,7 @@ impl SnapshotDownloader { async fn download_layers( &self, ttid: TenantShardTimelineId, - layers: Vec<(LayerFileName, IndexLayerMetadata)>, + layers: Vec<(LayerName, IndexLayerMetadata)>, ) -> anyhow::Result<()> { let layer_count = layers.len(); tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count); @@ -138,7 +138,7 @@ impl SnapshotDownloader { tracing::info!( "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}", layer_metadata.file_size, - layer_name.file_name() + layer_name ); } Err(e) => { @@ -159,11 +159,11 @@ impl SnapshotDownloader { async fn download_timeline( &self, ttid: TenantShardTimelineId, - index_part: IndexPart, + index_part: Box, index_part_generation: Generation, ancestor_layers: &mut HashMap< TenantShardTimelineId, - HashMap, + HashMap, >, ) -> anyhow::Result<()> { let index_bytes = serde_json::to_string(&index_part).unwrap(); @@ -234,7 +234,7 @@ impl SnapshotDownloader { // happen if this tenant has been split at some point) let mut ancestor_layers: HashMap< TenantShardTimelineId, - HashMap, + HashMap, > = Default::default(); for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 9ce26e6c5d04..30d0081a47cc 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -519,6 +519,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { + check_permission(&r, None)?; let cancel = CancellationToken::new(); failpoints_handler(r, cancel).await }) diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 7da5fd00b01e..59a8c595ab8a 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -506,6 +506,8 @@ struct WalSender<'a, IO> { send_buf: [u8; MAX_SEND_SIZE], } +const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); + impl WalSender<'_, IO> { /// Send WAL until /// - an error occurs @@ -584,14 +586,22 @@ impl WalSender<'_, IO> { async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> { loop { self.end_pos = self.end_watch.get(); - if self.end_pos > self.start_pos { - // We have something to send. + let have_something_to_send = (|| { + fail::fail_point!( + "sk-pause-send", + self.appname.as_deref() != Some("pageserver"), + |_| { false } + ); + self.end_pos > self.start_pos + })(); + + if have_something_to_send { trace!("got end_pos {:?}, streaming", self.end_pos); return Ok(()); } // Wait for WAL to appear, now self.end_pos == self.start_pos. - if let Some(lsn) = wait_for_lsn(&mut self.end_watch, self.term, self.start_pos).await? { + if let Some(lsn) = self.wait_for_lsn().await? { self.end_pos = lsn; trace!("got end_pos {:?}, streaming", self.end_pos); return Ok(()); @@ -628,6 +638,54 @@ impl WalSender<'_, IO> { .await?; } } + + /// Wait until we have available WAL > start_pos or timeout expires. 
Returns + /// - Ok(Some(end_pos)) if needed lsn is successfully observed; + /// - Ok(None) if timeout expired; + /// - Err in case of error -- only if 1) term changed while fetching in recovery + /// mode 2) watch channel closed, which must never happen. + async fn wait_for_lsn(&mut self) -> anyhow::Result> { + let fp = (|| { + fail::fail_point!( + "sk-pause-send", + self.appname.as_deref() != Some("pageserver"), + |_| { true } + ); + false + })(); + if fp { + tokio::time::sleep(POLL_STATE_TIMEOUT).await; + return Ok(None); + } + + let res = timeout(POLL_STATE_TIMEOUT, async move { + loop { + let end_pos = self.end_watch.get(); + if end_pos > self.start_pos { + return Ok(end_pos); + } + if let EndWatch::Flush(rx) = &self.end_watch { + let curr_term = rx.borrow().term; + if let Some(client_term) = self.term { + if curr_term != client_term { + bail!("term changed: requested {}, now {}", client_term, curr_term); + } + } + } + self.end_watch.changed().await?; + } + }) + .await; + + match res { + // success + Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)), + // error inside closure + Ok(Err(err)) => Err(err), + // timeout + Err(_) => Ok(None), + } + } } /// A half driving receiving replies. @@ -685,47 +743,6 @@ impl ReplyReader { } } -const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); - -/// Wait until we have available WAL > start_pos or timeout expires. Returns -/// - Ok(Some(end_pos)) if needed lsn is successfully observed; -/// - Ok(None) if timeout expired; -/// - Err in case of error -- only if 1) term changed while fetching in recovery -/// mode 2) watch channel closed, which must never happen. -async fn wait_for_lsn( - rx: &mut EndWatch, - client_term: Option, - start_pos: Lsn, -) -> anyhow::Result> { - let res = timeout(POLL_STATE_TIMEOUT, async move { - loop { - let end_pos = rx.get(); - if end_pos > start_pos { - return Ok(end_pos); - } - if let EndWatch::Flush(rx) = rx { - let curr_term = rx.borrow().term; - if let Some(client_term) = client_term { - if curr_term != client_term { - bail!("term changed: requested {}, now {}", client_term, curr_term); - } - } - } - rx.changed().await?; - } - }) - .await; - - match res { - // success - Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)), - // error inside closure - Ok(Err(err)) => Err(err), - // timeout - Err(_) => Ok(None), - } -} - #[cfg(test)] mod tests { use utils::id::{TenantId, TimelineId}; diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index c49495a4f3fa..5578c94cf6f8 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -17,8 +17,7 @@ use utils::lsn::Lsn; use walproposer::{ api_bindings::Level, bindings::{ - pg_atomic_uint64, NeonWALReadResult, PageserverFeedback, SafekeeperStateDesiredEvents, - WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, + NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, }, walproposer::{ApiImpl, Config}, }; @@ -224,31 +223,13 @@ impl SimulationApi { }) .collect::>(); - let empty_feedback = PageserverFeedback { - present: false, - currentClusterSize: 0, - last_received_lsn: 0, - disk_consistent_lsn: 0, - remote_consistent_lsn: 0, - replytime: 0, - shard_number: 0, - }; - Self { os: args.os, safekeepers: RefCell::new(sk_conns), disk: args.disk, redo_start_lsn: args.redo_start_lsn, last_logged_commit_lsn: 0, - shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState { - mutex: 0, - mineLastElectedTerm: 0, - 
backpressureThrottlingTime: pg_atomic_uint64 { value: 0 }, - currentClusterSize: pg_atomic_uint64 { value: 0 }, - shard_ps_feedback: [empty_feedback; 128], - num_shards: 0, - min_ps_feedback: empty_feedback, - }), + shmem: UnsafeCell::new(walproposer::api_bindings::empty_shmem()), config: args.config, event_set: RefCell::new(None), } @@ -274,6 +255,12 @@ impl ApiImpl for SimulationApi { self.os.now() as i64 * 1000 } + fn update_donor(&self, donor: &mut walproposer::bindings::Safekeeper, donor_lsn: u64) { + let mut shmem = unsafe { *self.get_shmem_state() }; + shmem.propEpochStartLsn.value = donor_lsn; + shmem.donor_conninfo = donor.conninfo; + } + fn conn_status( &self, _: &mut walproposer::bindings::Safekeeper, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index d3a53066c976..ae7e8d3d7dfb 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4745,7 +4745,7 @@ impl Service { // them in an optimization const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024; - if progress.bytes_total == 0 + if progress.heatmap_mtime.is_none() || progress.bytes_total < DOWNLOAD_FRESHNESS_THRESHOLD && progress.bytes_downloaded != progress.bytes_total || progress.bytes_total - progress.bytes_downloaded diff --git a/test_runner/README.md b/test_runner/README.md index 96e74659cea5..fd68cfff79bd 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -76,13 +76,10 @@ you can use `--pg-version` argument. `TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. -`NEON_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as `RUST_LOG`: logging configuration to pass into Neon CLI Useful parameters and commands: -`--pageserver-config-override=${value}` `-c` values to pass into pageserver through neon_local cli - `--preserve-database-files` to preserve pageserver (layer) and safekeer (segment) timeline files on disk after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. @@ -95,6 +92,166 @@ Exit after the first test failure: `./scripts/pytest -x ...` (there are many more pytest options; run `pytest -h` to see them.) +#### Running Python tests against real S3 or S3-compatible services + +Neon's `libs/remote_storage` supports multiple implementations of remote storage. +At the time of writing, that is +```rust +pub enum RemoteStorageKind { + /// Storage based on local file system. + /// Specify a root folder to place all stored files into. + LocalFs(Utf8PathBuf), + /// AWS S3 based storage, storing all files in the S3 bucket + /// specified by the config + AwsS3(S3Config), + /// Azure Blob based storage, storing all files in the container + /// specified by the config + AzureContainer(AzureConfig), +} +``` + +The test suite has a Python enum with equal name but different meaning: + +```python +@enum.unique +class RemoteStorageKind(str, enum.Enum): + LOCAL_FS = "local_fs" + MOCK_S3 = "mock_s3" + REAL_S3 = "real_s3" +``` + +* `LOCAL_FS` => `LocalFs` +* `MOCK_S3`: starts [`moto`](https://github.com/getmoto/moto)'s S3 implementation, then configures Pageserver with `AwsS3` +* `REAL_S3` => configure `AwsS3` as detailed below + +When a test in the test suite needs an `AwsS3`, it is supposed to call `remote_storage.s3_storage()`. 
+That function checks env var `ENABLE_REAL_S3_REMOTE_STORAGE`: +* If it is not set, use `MOCK_S3` +* If it is set, use `REAL_S3`. + +For `REAL_S3`, the test suite creates the dict/toml representation of the `RemoteStorageKind::AwsS3` based on env vars: + +```rust +pub struct S3Config { + // test suite env var: REMOTE_STORAGE_S3_BUCKET + pub bucket_name: String, + // test suite env var: REMOTE_STORAGE_S3_REGION + pub bucket_region: String, + // test suite determines this + pub prefix_in_bucket: Option, + // no env var exists; test suite sets it for MOCK_S3, because that's how moto works + pub endpoint: Option, + ... +} +``` + +*Credentials* are not part of the config, but discovered by the AWS SDK. +See the `libs/remote_storage` Rust code. +We're documenting two mechanism here: + +The test suite supports two mechanisms (`remote_storage.py`): + +**Credential mechanism 1**: env vars `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. +Populate the env vars with AWS access keys that you created in IAM. +Our CI uses this mechanism. +However, it is _not_ recommended for interactive use by developers ([learn more](https://docs.aws.amazon.com/sdkref/latest/guide/access-users.html#credentials-long-term)). +Instead, use profiles (next section). + +**Credential mechanism 2**: env var `AWS_PROFILE`. +This uses the AWS SDK's (and CLI's) profile mechanism. +Learn more about it [in the official docs](https://docs.aws.amazon.com/sdkref/latest/guide/file-format.html). +After configuring a profile (e.g. via the aws CLI), set the env var to its name. + +In conclusion, the full command line is: + +```bash +# with long-term AWS access keys +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=eu-central-1 \ +AWS_ACCESS_KEY_ID=... \ +AWS_SECRET_ACCESS_KEY=... \ +./scripts/pytest +``` + +```bash +# with AWS PROFILE +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=eu-central-1 \ +AWS_PROFILE=... \ +./scripts/pytest +``` + +If you're using SSO, make sure to `aws sso login --profile $AWS_PROFILE` first. + +##### Minio + +If you want to run test without the cloud setup, we recommend [minio](https://min.io/docs/minio/linux/index.html). + +```bash +# Start in Terminal 1 +mkdir /tmp/minio_data +minio server /tmp/minio_data --console-address 127.0.0.1:9001 --address 127.0.0.1:9000 +``` + +In another terminal, create an `aws` CLI profile for it: + +```ini +# append to ~/.aws/config +[profile local-minio] +services = local-minio-services +[services local-minio-services] +s3 = + endpoint_url=http://127.0.0.1:9000/ +``` + + +Now configure the credentials (this is going to write `~/.aws/credentials` for you). +It's an interactive prompt. + +```bash +# Terminal 2 +$ aws --profile local-minio configure +AWS Access Key ID [None]: minioadmin +AWS Secret Access Key [None]: minioadmin +Default region name [None]: +Default output format [None]: +``` + +Now create a bucket `testbucket` using the CLI. + +```bash +# (don't forget to have AWS_PROFILE env var set; or use --profile) +aws --profile local-minio s3 mb s3://mybucket +``` + +(If it doesn't work, make sure you update your AWS CLI to a recent version. + The [service-specific endpoint feature](https://docs.aws.amazon.com/sdkref/latest/guide/feature-ss-endpoints.html) + that we're using is quite new.) 
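
Before running the test suite against minio, you can optionally sanity-check the endpoint, credentials, and bucket from Python. This is only a sketch; it assumes the defaults used above (endpoint `127.0.0.1:9000`, `minioadmin` credentials, bucket `mybucket`) and talks to S3 via `boto3` directly rather than through the test suite:

```python
# Optional sanity check for the local minio setup described above.
# Assumes the defaults from this section: endpoint 127.0.0.1:9000,
# access key / secret "minioadmin", and a bucket named "mybucket".
import boto3

s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:9000",
    aws_access_key_id="minioadmin",
    aws_secret_access_key="minioadmin",
)

buckets = [b["Name"] for b in s3.list_buckets()["Buckets"]]
assert "mybucket" in buckets, f"expected 'mybucket' to exist, got: {buckets}"
```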
+ +```bash +# with AWS PROFILE +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=doesntmatterforminio \ +AWS_PROFILE=local-minio \ +./scripts/pytest +``` + +NB: you can avoid the `--profile` by setting the `AWS_PROFILE` variable. +Just like the AWS SDKs, the `aws` CLI is sensible to it. + +#### Running Rust tests against real S3 or S3-compatible services + +We have some Rust tests that only run against real S3, e.g., [here](https://github.com/neondatabase/neon/blob/c18d3340b5e3c978a81c3db8b6f1e83cd9087e8a/libs/remote_storage/tests/test_real_s3.rs#L392-L397). + +They use the same env vars as the Python test suite (see previous section) +but interpret them on their own. +However, at this time, the interpretation is identical. + +So, above instructions apply to the Rust test as well. + ### Writing a test Every test needs a Neon Environment, or NeonEnv to operate in. A Neon Environment diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 90884ad7f846..da379693a0ac 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -54,7 +54,7 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.types import IndexPartDump +from fixtures.pageserver.types import IndexPartDump, LayerName, parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, @@ -68,7 +68,7 @@ RemoteStorageUser, S3Storage, default_remote_storage, - remote_storage_to_toml_inline_table, + remote_storage_to_toml_dict, ) from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import are_walreceivers_absent @@ -82,6 +82,7 @@ subprocess_capture, wait_until, ) +from fixtures.utils import AuxFileStore as AuxFileStore # reexport """ This file contains pytest fixtures. 
A fixture is a test resource that can be @@ -450,6 +451,7 @@ def __init__( test_output_dir: Path, test_overlay_dir: Optional[Path] = None, pageserver_remote_storage: Optional[RemoteStorage] = None, + # toml that will be decomposed into `--config-override` flags during `pageserver --init` pageserver_config_override: Optional[str] = None, num_safekeepers: int = 1, num_pageservers: int = 1, @@ -464,6 +466,7 @@ def __init__( initial_tenant: Optional[TenantId] = None, initial_timeline: Optional[TimelineId] = None, pageserver_virtual_file_io_engine: Optional[str] = None, + pageserver_aux_file_policy: Optional[AuxFileStore] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -487,6 +490,7 @@ def __init__( self.env: Optional[NeonEnv] = None self.keep_remote_storage_contents: bool = True self.neon_binpath = neon_binpath + self.neon_local_binpath = neon_binpath self.pg_distrib_dir = pg_distrib_dir self.pg_version = pg_version self.preserve_database_files = preserve_database_files @@ -518,6 +522,8 @@ def __init__( self.pageserver_validate_vectored_get = bool(validate) log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"') + self.pageserver_aux_file_policy = pageserver_aux_file_policy + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -563,6 +569,7 @@ def init_start( timeline_id=env.initial_timeline, shard_count=initial_tenant_shard_count, shard_stripe_size=initial_tenant_shard_stripe_size, + aux_file_v2=self.pageserver_aux_file_policy, ) assert env.initial_tenant == initial_tenant assert env.initial_timeline == initial_timeline @@ -631,17 +638,11 @@ def _build_and_use_snapshot_impl( def from_repo_dir( self, repo_dir: Path, - neon_binpath: Optional[Path] = None, - pg_distrib_dir: Optional[Path] = None, ) -> NeonEnv: """ A simple method to import data into the current NeonEnvBuilder from a snapshot of a repo dir. """ - # Setting custom `neon_binpath` and `pg_distrib_dir` is useful for compatibility tests - self.neon_binpath = neon_binpath or self.neon_binpath - self.pg_distrib_dir = pg_distrib_dir or self.pg_distrib_dir - # Get the initial tenant and timeline from the snapshot config snapshot_config_toml = repo_dir / "config" with snapshot_config_toml.open("r") as f: @@ -700,6 +701,11 @@ def from_repo_dir( config["default_tenant_id"] = snapshot_config["default_tenant_id"] config["branch_name_mappings"] = snapshot_config["branch_name_mappings"] + # Update the config with new neon + postgres path in case of compat test + # FIXME: overriding pg_distrib_dir cause storage controller fail to start + # config["pg_distrib_dir"] = str(self.pg_distrib_dir) + config["neon_distrib_dir"] = str(self.neon_binpath) + with (self.repo_dir / "config").open("w") as f: toml.dump(config, f) @@ -981,7 +987,7 @@ class NeonEnv: Some notable functions and fields in NeonEnv: - postgres - A factory object for creating postgres compute nodes. + endpoints - A factory object for creating postgres compute nodes. 
pageservers - An array containing objects representing the pageservers @@ -1016,12 +1022,12 @@ def __init__(self, config: NeonEnvBuilder): self.pg_version = config.pg_version # Binary path for pageserver, safekeeper, etc self.neon_binpath = config.neon_binpath - # Binary path for neon_local test-specific binaries: may be overridden - # after construction for compat testing - self.neon_local_binpath = config.neon_binpath + # Binary path for neon_local test-specific binaries + self.neon_local_binpath = config.neon_local_binpath + if self.neon_local_binpath is None: + self.neon_local_binpath = self.neon_binpath self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 - self.pageserver_config_override = config.pageserver_config_override self.storage_controller_config = config.storage_controller_config # generate initial tenant ID here instead of letting 'neon init' generate it, @@ -1051,15 +1057,16 @@ def __init__(self, config: NeonEnvBuilder): ) self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine + self.pageserver_aux_file_policy = config.pageserver_aux_file_policy - # Create a config file corresponding to the options + # Create the neon_local's `NeonLocalInitConf` cfg: Dict[str, Any] = { "default_tenant_id": str(self.initial_tenant), "broker": { "listen_addr": self.broker.listen_addr(), }, - "pageservers": [], "safekeepers": [], + "pageservers": [], } if self.control_plane_api is not None: @@ -1098,6 +1105,17 @@ def __init__(self, config: NeonEnvBuilder): if config.pageserver_validate_vectored_get is not None: ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get + if self.pageserver_remote_storage is not None: + ps_cfg["remote_storage"] = remote_storage_to_toml_dict( + self.pageserver_remote_storage + ) + + if config.pageserver_config_override is not None: + for o in config.pageserver_config_override.split(";"): + override = toml.loads(o) + for key, value in override.items(): + ps_cfg[key] = value + # Create a corresponding NeonPageserver object self.pageservers.append( NeonPageserver( @@ -1131,7 +1149,10 @@ def __init__(self, config: NeonEnvBuilder): cfg["safekeepers"].append(sk_cfg) log.info(f"Config: {cfg}") - self.neon_cli.init(cfg, force=config.config_init_force) + self.neon_cli.init( + cfg, + force=config.config_init_force, + ) def start(self): # Storage controller starts first, so that pageserver /re-attach calls don't @@ -1283,6 +1304,7 @@ def _shared_simple_env( pg_distrib_dir: Path, pg_version: PgVersion, pageserver_virtual_file_io_engine: str, + pageserver_aux_file_policy: Optional[AuxFileStore], ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES @@ -1313,6 +1335,7 @@ def _shared_simple_env( test_name=request.node.name, test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, + pageserver_aux_file_policy=pageserver_aux_file_policy, ) as builder: env = builder.init_start() @@ -1352,6 +1375,7 @@ def neon_env_builder( test_overlay_dir: Path, top_output_dir: Path, pageserver_virtual_file_io_engine: str, + pageserver_aux_file_policy: Optional[AuxFileStore] = None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
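
(Reviewer note on the `NeonEnv.__init__` hunk above: the test suite no longer passes `--pageserver-config-override` flags at pageserver start; each `;`-separated TOML snippet from `pageserver_config_override` is merged into the pageserver's section of the `neon_local` init config, which `neon_local` later decomposes into `--config-override` flags during `pageserver --init`. A minimal, self-contained sketch of that merge; the override string below is only an illustrative example.)

```python
# Standalone sketch of the override merge performed in NeonEnv.__init__ above:
# each ';'-separated TOML snippet is parsed and folded into the pageserver's
# config dict before it is handed to `neon_local init`.
from typing import Any, Dict

import toml


def apply_pageserver_overrides(ps_cfg: Dict[str, Any], overrides: str) -> Dict[str, Any]:
    for o in overrides.split(";"):
        for key, value in toml.loads(o).items():
            ps_cfg[key] = value
    return ps_cfg


ps_cfg: Dict[str, Any] = {"listen_pg_addr": "127.0.0.1:64000"}
apply_pageserver_overrides(
    ps_cfg, "background_task_maximum_delay='0s';tenant_config={ gc_period = '0s' }"
)
print(ps_cfg)
```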
@@ -1385,6 +1409,7 @@ def neon_env_builder( test_name=request.node.name, test_output_dir=test_output_dir, test_overlay_dir=test_overlay_dir, + pageserver_aux_file_policy=pageserver_aux_file_policy, ) as builder: yield builder @@ -1544,6 +1569,7 @@ def create_tenant( shard_stripe_size: Optional[int] = None, placement_policy: Optional[str] = None, set_default: bool = False, + aux_file_v2: Optional[AuxFileStore] = None, ) -> Tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. @@ -1567,6 +1593,16 @@ def create_tenant( product(["-c"], (f"{key}:{value}" for key, value in conf.items())) ) ) + + if aux_file_v2 is AuxFileStore.V2: + args.extend(["-c", "switch_aux_file_policy:v2"]) + + if aux_file_v2 is AuxFileStore.V1: + args.extend(["-c", "switch_aux_file_policy:v1"]) + + if aux_file_v2 is AuxFileStore.CrossValidation: + args.extend(["-c", "switch_aux_file_policy:cross_validation"]) + if set_default: args.append("--set-default") @@ -1701,32 +1737,24 @@ def list_timelines(self, tenant_id: Optional[TenantId] = None) -> List[Tuple[str def init( self, - config: Dict[str, Any], + init_config: Dict[str, Any], force: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": - with tempfile.NamedTemporaryFile(mode="w+") as tmp: - tmp.write(toml.dumps(config)) - tmp.flush() + with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile: + init_config_tmpfile.write(toml.dumps(init_config)) + init_config_tmpfile.flush() - cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version] + cmd = [ + "init", + f"--config={init_config_tmpfile.name}", + ] if force is not None: cmd.extend(["--force", force]) - storage = self.env.pageserver_remote_storage - - append_pageserver_param_overrides( - params_to_update=cmd, - remote_storage=storage, - pageserver_config_override=self.env.pageserver_config_override, - ) - - s3_env_vars = None - if isinstance(storage, S3Storage): - s3_env_vars = storage.access_env_vars() - res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) + res = self.raw_cli(cmd) res.check_returncode() - return res + return res def storage_controller_start(self): cmd = ["storage_controller", "start"] @@ -1741,16 +1769,10 @@ def storage_controller_stop(self, immediate: bool): def pageserver_start( self, id: int, - overrides: Tuple[str, ...] 
= (), extra_env_vars: Optional[Dict[str, str]] = None, ) -> "subprocess.CompletedProcess[str]": - start_args = ["pageserver", "start", f"--id={id}", *overrides] + start_args = ["pageserver", "start", f"--id={id}"] storage = self.env.pageserver_remote_storage - append_pageserver_param_overrides( - params_to_update=start_args, - remote_storage=storage, - pageserver_config_override=self.env.pageserver_config_override, - ) if isinstance(storage, S3Storage): s3_env_vars = storage.access_env_vars() @@ -1801,6 +1823,7 @@ def endpoint_create( hot_standby: bool = False, lsn: Optional[Lsn] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1824,6 +1847,8 @@ def endpoint_create( args.extend(["--hot-standby", "true"]) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -1835,6 +1860,7 @@ def endpoint_start( safekeepers: Optional[List[int]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1849,6 +1875,8 @@ def endpoint_start( args.append(endpoint_id) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -2408,9 +2436,42 @@ def tenant_dir( return self.workdir / "tenants" return self.workdir / "tenants" / str(tenant_shard_id) + @property + def config_toml_path(self) -> Path: + return self.workdir / "pageserver.toml" + + def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]): + """ + Edit the pageserver's config toml file in place. + """ + path = self.config_toml_path + with open(path, "r") as f: + config = toml.load(f) + edit_fn(config) + with open(path, "w") as f: + toml.dump(config, f) + + def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]: + """ + Non-recursively merge the given `patch` dict into the existing config toml, using `dict.update()`. + Returns the replaced values. + If there was no previous value, the key is mapped to None. + This allows to restore the original value by calling this method with the returned dict. + """ + replacements = {} + + def doit(config: Dict[str, Any]): + while len(patch) > 0: + key, new = patch.popitem() + old = config.get(key, None) + config[key] = new + replacements[key] = old + + self.edit_config_toml(doit) + return replacements + def start( self, - overrides: Tuple[str, ...] = (), extra_env_vars: Optional[Dict[str, str]] = None, ) -> "NeonPageserver": """ @@ -2420,9 +2481,7 @@ def start( """ assert self.running is False - self.env.neon_cli.pageserver_start( - self.id, overrides=overrides, extra_env_vars=extra_env_vars - ) + self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars) self.running = True return self @@ -2584,32 +2643,36 @@ def tenant_load(self, tenant_id: TenantId): tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id) ) + def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: + """ + Inspect local storage on a pageserver to discover which layer files are present. 
-def append_pageserver_param_overrides( - params_to_update: List[str], - remote_storage: Optional[RemoteStorage], - pageserver_config_override: Optional[str] = None, -): - if remote_storage is not None: - remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) - - params_to_update.append( - f"--pageserver-config-override=remote_storage={remote_storage_toml_table}" + :return: list of relative paths to layers, from the timeline root. + """ + timeline_path = self.timeline_dir(tenant_id, timeline_id) + + def relative(p: Path) -> Path: + return p.relative_to(timeline_path) + + return sorted( + list( + map( + relative, + filter( + lambda path: path.name != "metadata" + and "ephemeral" not in path.name + and "temp" not in path.name, + timeline_path.glob("*"), + ), + ) + ) ) - else: - params_to_update.append('--pageserver-config-override=remote_storage=""') - - env_overrides = os.getenv("NEON_PAGESERVER_OVERRIDES") - if env_overrides is not None: - params_to_update += [ - f"--pageserver-config-override={o.strip()}" for o in env_overrides.split(";") - ] - if pageserver_config_override is not None: - params_to_update += [ - f"--pageserver-config-override={o.strip()}" - for o in pageserver_config_override.split(";") - ] + def layer_exists( + self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerName + ) -> bool: + layers = self.list_layers(tenant_id, timeline_id) + return layer_name in [parse_layer_file_name(p.name) for p in layers] class PgBin: @@ -3299,6 +3362,7 @@ def create( lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Create a new Postgres endpoint. @@ -3321,6 +3385,7 @@ def create( pg_port=self.pg_port, http_port=self.http_port, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) @@ -3337,7 +3402,10 @@ def create( return self def start( - self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None + self, + remote_ext_config: Optional[str] = None, + pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Start the Postgres instance. @@ -3353,6 +3421,7 @@ def start( safekeepers=self.active_safekeepers, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) self.running = True @@ -3482,6 +3551,7 @@ def create_start( config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "Endpoint": """ Create an endpoint, apply config, and start Postgres. 
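
(Reviewer note: a minimal usage sketch for the new `list_layers()` / `layer_exists()` helpers introduced above, mirroring how the regression tests further down switch away from globbing the timeline directory. This is a hypothetical test body; `env`, `tenant_id`, and `timeline_id` are assumed to come from the usual fixtures.)

```python
# Hypothetical test snippet using the helpers added above.
from fixtures.pageserver.types import parse_layer_file_name


def assert_some_layer_is_resident(env, tenant_id, timeline_id):
    # Relative paths of layer files currently on the pageserver's local disk,
    # with "metadata", ephemeral, and temp files already filtered out.
    layers = env.pageserver.list_layers(tenant_id, timeline_id)
    assert layers, "expected at least one local layer file"

    # layer_exists() compares parsed LayerName values, so the check does not
    # depend on the generation suffix of the on-disk file name.
    first = parse_layer_file_name(layers[0].name)
    assert env.pageserver.layer_exists(tenant_id, timeline_id, first)
```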
@@ -3497,7 +3567,12 @@ def create_start( hot_standby=hot_standby, lsn=lsn, pageserver_id=pageserver_id, - ).start(remote_ext_config=remote_ext_config, pageserver_id=pageserver_id) + allow_multiple=allow_multiple, + ).start( + remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, + allow_multiple=allow_multiple, + ) log.info(f"Postgres startup took {time.time() - started_at} seconds") diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 8b895dcd9299..58a76d7586ae 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -88,7 +88,9 @@ def scan_pageserver_log_for_errors( ".*Flushed oversized open layer with size.*", # During teardown, we stop the storage controller before the pageservers, so pageservers # can experience connection errors doing background deletion queue work. - ".*WARN deletion backend: calling control plane generation validation API failed.*Connection refused.*", + ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*", + # Can happen when the test shuts down the storage controller while it is calling the utilization API + ".*WARN.*path=/v1/utilization .*request was dropped before completing", ) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 231ffd898e0c..b06972056c10 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -819,6 +819,23 @@ def download_all_layers( continue self.download_layer(tenant_id, timeline_id, layer.layer_file_name) + def detach_ancestor( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + batch_size: int | None = None, + ) -> Set[TimelineId]: + params = {} + if batch_size is not None: + params["batch_size"] = batch_size + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", + params=params, + ) + self.verbose_error(res) + json = res.json() + return set(map(TimelineId, json["reparented_timelines"])) + def evict_layer( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str ): diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/types.py index 72fa30a2f2eb..1fb618f44523 100644 --- a/test_runner/fixtures/pageserver/types.py +++ b/test_runner/fixtures/pageserver/types.py @@ -1,3 +1,4 @@ +import re from dataclasses import dataclass from typing import Any, Dict, Tuple, Union @@ -11,7 +12,7 @@ class IndexLayerMetadata: @dataclass(frozen=True) -class ImageLayerFileName: +class ImageLayerName: lsn: Lsn key_start: Key key_end: Key @@ -25,7 +26,7 @@ def to_str(self): @dataclass(frozen=True) -class DeltaLayerFileName: +class DeltaLayerName: lsn_start: Lsn lsn_end: Lsn key_start: Key @@ -40,65 +41,57 @@ def to_str(self) -> str: return ret -LayerFileName = Union[ImageLayerFileName, DeltaLayerFileName] +LayerName = Union[ImageLayerName, DeltaLayerName] class InvalidFileName(Exception): pass +IMAGE_LAYER_FILE_NAME = re.compile( + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-v1-[a-f0-9]{8})?$" +) + + def parse_image_layer(f_name: str) -> Tuple[int, int, int]: """Parse an image layer file name. 
Return key start, key end, and snapshot lsn""" - parts = f_name.split("__") - if len(parts) != 2: - raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}") - key_parts = parts[0].split("-") - if len(key_parts) != 2: - raise InvalidFileName( - f"expecting two key parts separated by '--' in parts[0], got: {key_parts}" - ) - try: - return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16) - except ValueError as e: - raise InvalidFileName(f"conversion error: {f_name}") from e + + match = IMAGE_LAYER_FILE_NAME.match(f_name) + if match is None: + raise InvalidFileName(f"'{f_name}' is not an image layer filename") + + return int(match.group(1), 16), int(match.group(2), 16), int(match.group(3), 16) + + +DELTA_LAYER_FILE_NAME = re.compile( + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-v1-[a-f0-9]{8})?$" +) def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: """Parse a delta layer file name. Return key start, key end, lsn start, and lsn end""" - parts = f_name.split("__") - if len(parts) != 2: - raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}") - key_parts = parts[0].split("-") - if len(key_parts) != 2: - raise InvalidFileName( - f"expecting two key parts separated by '--' in parts[0], got: {key_parts}" - ) - lsn_parts = parts[1].split("-") - if len(lsn_parts) != 2: - raise InvalidFileName( - f"expecting two lsn parts separated by '--' in parts[1], got: {lsn_parts}" - ) - try: - return ( - int(key_parts[0], 16), - int(key_parts[1], 16), - int(lsn_parts[0], 16), - int(lsn_parts[1], 16), - ) - except ValueError as e: - raise InvalidFileName(f"conversion error: {f_name}") from e + match = DELTA_LAYER_FILE_NAME.match(f_name) + if match is None: + raise InvalidFileName(f"'{f_name}' is not an delta layer filename") + + return ( + int(match.group(1), 16), + int(match.group(2), 16), + int(match.group(3), 16), + int(match.group(4), 16), + ) -def parse_layer_file_name(file_name: str) -> LayerFileName: +def parse_layer_file_name(file_name: str) -> LayerName: try: key_start, key_end, lsn = parse_image_layer(file_name) - return ImageLayerFileName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end)) + return ImageLayerName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end)) except InvalidFileName: pass try: key_start, key_end, lsn_start, lsn_end = parse_delta_layer(file_name) - return DeltaLayerFileName( + return DeltaLayerName( lsn_start=Lsn(lsn_start), lsn_end=Lsn(lsn_end), key_start=Key(key_start), @@ -110,18 +103,15 @@ def parse_layer_file_name(file_name: str) -> LayerFileName: raise InvalidFileName("neither image nor delta layer") -def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): +def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn): """ Determines if this layer file is considered to be in future meaning we will discard these layers during timeline initialization from the given disk_consistent_lsn. 
""" - if ( - isinstance(layer_file_name, ImageLayerFileName) - and layer_file_name.lsn > disk_consistent_lsn - ): + if isinstance(layer_file_name, ImageLayerName) and layer_file_name.lsn > disk_consistent_lsn: return True elif ( - isinstance(layer_file_name, DeltaLayerFileName) + isinstance(layer_file_name, DeltaLayerName) and layer_file_name.lsn_end > disk_consistent_lsn + 1 ): return True @@ -131,7 +121,7 @@ def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): @dataclass class IndexPartDump: - layer_metadata: Dict[LayerFileName, IndexLayerMetadata] + layer_metadata: Dict[LayerName, IndexLayerMetadata] disk_consistent_lsn: Lsn @classmethod diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index c8ab550ad700..77523a542b1f 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -5,6 +5,7 @@ from _pytest.python import Metafunc from fixtures.pg_version import PgVersion +from fixtures.utils import AuxFileStore """ Dynamically parametrize tests by different parameters @@ -31,6 +32,11 @@ def pageserver_virtual_file_io_engine() -> Optional[str]: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") +@pytest.fixture(scope="function", autouse=True) +def pageserver_aux_file_policy() -> Optional[AuxFileStore]: + return None + + def pytest_generate_tests(metafunc: Metafunc): if (bt := os.getenv("BUILD_TYPE")) is None: build_types = ["debug", "release"] diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 83f9f26837a6..132d2450a7ec 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -50,7 +50,7 @@ def __init__( # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux # if a process is started from the shell process. 
- self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"]) + self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", f"-p{port}"]) error = None try: return_code = self.subprocess.poll() @@ -141,11 +141,13 @@ def heatmap_content(self, tenant_id): with self.heatmap_path(tenant_id).open("r") as f: return json.load(f) - def to_toml_inline_table(self) -> str: - rv = { + def to_toml_dict(self) -> Dict[str, Any]: + return { "local_path": str(self.root), } - return toml.TomlEncoder().dump_inline_table(rv) + + def to_toml_inline_table(self) -> str: + return toml.TomlEncoder().dump_inline_table(self.to_toml_dict()) def cleanup(self): # no cleanup is done here, because there's NeonEnvBuilder.cleanup_local_storage which will remove everything, including localfs files @@ -194,7 +196,7 @@ def to_string(self) -> str: } ) - def to_toml_inline_table(self) -> str: + def to_toml_dict(self) -> Dict[str, Any]: rv = { "bucket_name": self.bucket_name, "bucket_region": self.bucket_region, @@ -206,7 +208,10 @@ def to_toml_inline_table(self) -> str: if self.endpoint is not None: rv["endpoint"] = self.endpoint - return toml.TomlEncoder().dump_inline_table(rv) + return rv + + def to_toml_inline_table(self) -> str: + return toml.TomlEncoder().dump_inline_table(self.to_toml_dict()) def do_cleanup(self): if not self.cleanup: @@ -414,6 +419,13 @@ def default_remote_storage() -> RemoteStorageKind: return RemoteStorageKind.LOCAL_FS +def remote_storage_to_toml_dict(remote_storage: RemoteStorage) -> Dict[str, Any]: + if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): + raise Exception("invalid remote storage type") + + return remote_storage.to_toml_dict() + + # serialize as toml inline table def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str: if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 9365d65fc9e2..6470621900b3 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,4 +1,5 @@ import contextlib +import enum import json import os import re @@ -484,3 +485,16 @@ def assert_no_errors(log_file, service, allowed_errors): log.info(f"not allowed {service} error: {error.strip()}") assert not errors, f"Log errors on {service}: {errors[0]}" + + +@enum.unique +class AuxFileStore(str, enum.Enum): + V1 = "V1" + V2 = "V2" + CrossValidation = "CrossValidation" + + def __repr__(self) -> str: + return f"'aux-{self.value}'" + + def __str__(self) -> str: + return f"'aux-{self.value}'" diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 54905759bd41..7687b8417fcb 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -140,10 +140,14 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # start without gc so we can time compaction with less noise; use shorter # period for compaction so it starts earlier + def patch_default_tenant_config(config): + tenant_config = config.get("tenant_config", {}) + tenant_config["compaction_period"] = "3s" + tenant_config["gc_period"] = "0s" + config["tenant_config"] = tenant_config + + env.pageserver.edit_config_toml(patch_default_tenant_config) env.pageserver.start( - overrides=( - "--pageserver-config-override=tenant_config={ compaction_period = '3s', gc_period = '0s' }", - ), # this does print more than we want, but the number should be comparable 
between runs extra_env_vars={ "RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info" diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 17dc96dabe20..632d465c3f7d 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -102,6 +102,9 @@ def check_memory(): tenant_id, shard_count, stripe_size, + # Upload heatmaps fast, so that secondary downloads happen promptly, enabling + # the controller's optimization migrations to proceed promptly. + tenant_config={"heatmap_period": "10s"}, placement_policy={"Attached": 1}, ) futs.append(f) diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj index 50243e3ea768..edf2a01337ce 100644 --- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -8,7 +8,7 @@ - + diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 59461cc09526..693add422f0c 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -190,7 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "trace_read_requests": True, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, - "switch_to_aux_file_v2": True, + "switch_aux_file_policy": "CrossValidation", } ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index e1ccb3e0c640..787c114fc1f7 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -233,17 +233,18 @@ def test_forward_compatibility( neon_env_builder.pageserver_validate_vectored_get = None neon_env_builder.num_safekeepers = 3 - neon_local_binpath = neon_env_builder.neon_binpath + + # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). + # But always use the current version's neon_local binary. + # This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI. + neon_env_builder.neon_binpath = compatibility_neon_bin + neon_env_builder.pg_distrib_dir = compatibility_postgres_distrib_dir + neon_env_builder.neon_local_binpath = neon_env_builder.neon_local_binpath + env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", - neon_binpath=compatibility_neon_bin, - pg_distrib_dir=compatibility_postgres_distrib_dir, ) - # Use current neon_local even though we're using old binaries for - # everything else: our test code is written for latest CLI args. 
- env.neon_local_binpath = neon_local_binpath - neon_env_builder.start() check_neon_works( diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index b83545216d8a..5e9efa7cce1b 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -5,7 +5,6 @@ from typing import Any, Dict, Iterable, Tuple import pytest -import toml from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -45,17 +44,16 @@ def assert_overrides(tenant_id, default_tenant_conf_value): ps_http.set_tenant_config(tenant_id, {}) assert_config(tenant_id, None, default_tenant_conf_value) - env.pageserver.stop() if config_level_override is not None: - env.pageserver.start( - overrides=( - "--pageserver-config-override=tenant_config={ min_resident_size_override = " - + str(config_level_override) - + " }", - ) - ) - else: - env.pageserver.start() + + def set_min_resident_size(config): + tenant_config = config.get("tenant_config", {}) + tenant_config["min_resident_size_override"] = config_level_override + config["tenant_config"] = tenant_config + + env.pageserver.edit_config_toml(set_min_resident_size) + env.pageserver.stop() + env.pageserver.start() tenant_id, _ = env.neon_cli.create_tenant() assert_overrides(tenant_id, config_level_override) @@ -164,34 +162,32 @@ def pageserver_start_with_disk_usage_eviction( usage eviction task is unknown; it might need to run one more iteration before assertions can be made. """ - disk_usage_config = { - "period": period, - "max_usage_pct": max_usage_pct, - "min_avail_bytes": min_avail_bytes, - "mock_statvfs": mock_behavior, - "eviction_order": eviction_order.config(), - } - - enc = toml.TomlEncoder() # these can sometimes happen during startup before any tenants have been # loaded, so nothing can be evicted, we just wait for next iteration which # is able to evict. pageserver.allowed_errors.append(".*WARN.* disk usage still high.*") - pageserver.start( - overrides=( - "--pageserver-config-override=disk_usage_based_eviction=" - + enc.dump_inline_table(disk_usage_config).replace("\n", " "), + pageserver.patch_config_toml_nonrecursive( + { + "disk_usage_based_eviction": { + "period": period, + "max_usage_pct": max_usage_pct, + "min_avail_bytes": min_avail_bytes, + "mock_statvfs": mock_behavior, + "eviction_order": eviction_order.config(), + }, # Disk usage based eviction runs as a background task. # But pageserver startup delays launch of background tasks for some time, to prioritize initial logical size calculations during startup. # But, initial logical size calculation may not be triggered if safekeepers don't publish new broker messages. # But, we only have a 10-second-timeout in this test. # So, disable the delay for this test. 
- "--pageserver-config-override=background_task_maximum_delay='0s'", - ), + "background_task_maximum_delay": "0s", + } ) + pageserver.start() + # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction for tenant_id, timeline_id in self.timelines: tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id) diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py index cb4fa43be724..7471338ce528 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_duplicate_layers.py @@ -2,6 +2,7 @@ import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload_queue_empty, @@ -86,14 +87,7 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) # path = env.remote_storage.timeline_path(tenant_id, timeline_id) l1_found = None - for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir(): - if path.name == "metadata" or path.name.startswith("ephemeral-"): - continue - - if len(path.suffixes) > 0: - # temp files - continue - + for path in env.pageserver.list_layers(tenant_id, timeline_id): [key_range, lsn_range] = path.name.split("__", maxsplit=1) if "-" not in lsn_range: @@ -108,19 +102,21 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) if l1_found is not None: raise RuntimeError(f"found multiple L1: {l1_found.name} and {path.name}") - l1_found = path + l1_found = parse_layer_file_name(path.name) assert l1_found is not None, "failed to find L1 locally" uploaded = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, l1_found.name + tenant_id, timeline_id, l1_found.to_str() ) assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded" env.pageserver.start() wait_until_tenant_active(pageserver_http, tenant_id) - assert not l1_found.exists(), "partial compaction result should had been removed during startup" + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, l1_found + ), "partial compaction result should had been removed during startup" # wait for us to catch up again wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) @@ -130,18 +126,18 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) # give time for log flush time.sleep(1) - message = f".*duplicated L1 layer layer={l1_found.name}" + message = f".*duplicated L1 layer layer={l1_found}" found_msg = env.pageserver.log_contains(message) # resident or evicted, it should not be overwritten, however it should had been non-existing at startup assert ( found_msg is None ), "layer should had been removed during startup, did it live on as evicted?" 
- assert l1_found.exists(), "the L1 reappears" + assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears" wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) uploaded = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, l1_found.name + tenant_id, timeline_id, l1_found.to_str() ) assert uploaded.exists(), "the L1 is uploaded" diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index fefb30bbdd7b..b178baea11c2 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -7,6 +7,7 @@ flush_ep_to_pageserver, wait_for_last_flush_lsn, ) +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import wait_for_upload from fixtures.remote_storage import RemoteStorageKind @@ -57,9 +58,9 @@ def test_basic_eviction( for sk in env.safekeepers: sk.stop() - timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id) - initial_local_layers = sorted( - list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + initial_local_layers = dict( + (parse_layer_file_name(path.name), path) + for path in env.pageserver.list_layers(tenant_id, timeline_id) ) assert ( len(initial_local_layers) > 1 @@ -73,6 +74,7 @@ def test_basic_eviction( assert len(initial_local_layers) == len( initial_layer_map_info.historic_layers ), "Should have the same layers in memory and on disk" + for returned_layer in initial_layer_map_info.historic_layers: assert ( returned_layer.kind == "Delta" @@ -81,27 +83,29 @@ def test_basic_eviction( not returned_layer.remote ), f"All created layers should be present locally, but got {returned_layer}" - local_layers = list( - filter(lambda layer: layer.name == returned_layer.layer_file_name, initial_local_layers) - ) + returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name) assert ( - len(local_layers) == 1 - ), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" - local_layer = local_layers[0] + returned_layer_name in initial_local_layers + ), f"Did not find returned layer {returned_layer_name} in local layers {list(initial_local_layers.keys())}" + + local_layer_path = ( + env.pageserver.timeline_dir(tenant_id, timeline_id) + / initial_local_layers[returned_layer_name] + ) assert ( - returned_layer.layer_file_size == local_layer.stat().st_size - ), f"Returned layer {returned_layer} has a different file size than local layer {local_layer}" + returned_layer.layer_file_size == local_layer_path.stat().st_size + ), f"Returned layer {returned_layer} has a different file size than local layer {local_layer_path}" # Detach all layers, ensre they are not in the local FS, but are still dumped as part of the layer map - for local_layer in initial_local_layers: + for local_layer_name, local_layer_path in initial_local_layers.items(): client.evict_layer( - tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_path.name ) - assert not any( - new_local_layer.name == local_layer.name for new_local_layer in timeline_path.glob("*") - ), f"Did not expect to find {local_layer} layer after evicting" + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, local_layer_name + ), f"Did not expect to find {local_layer_name} layer after evicting" - empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + 
empty_layers = env.pageserver.list_layers(tenant_id, timeline_id) assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id) @@ -118,15 +122,15 @@ def test_basic_eviction( assert ( returned_layer.remote ), f"All layers should be evicted and not present locally, but got {returned_layer}" - assert any( - local_layer.name == returned_layer.layer_file_name - for local_layer in initial_local_layers + returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name) + assert ( + returned_layer_name in initial_local_layers ), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" # redownload all evicted layers and ensure the initial state is restored - for local_layer in initial_local_layers: + for local_layer_name, _local_layer_path in initial_local_layers.items(): client.download_layer( - tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_name.to_str() ) client.timeline_download_remote_layers( tenant_id, @@ -137,8 +141,9 @@ def test_basic_eviction( at_least_one_download=False, ) - redownloaded_layers = sorted( - list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + redownloaded_layers = dict( + (parse_layer_file_name(path.name), path) + for path in env.pageserver.list_layers(tenant_id, timeline_id) ) assert ( redownloaded_layers == initial_local_layers @@ -154,7 +159,9 @@ def test_basic_eviction( def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + # don't create initial tenant, we'll create it manually with custom config + env = neon_env_builder.init_configs() + env.start() tenant_config = { "pitr_interval": "1s", # set to non-zero, so GC actually does something diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index f311a8bf2c99..cc34fd83e905 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -3,8 +3,8 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver from fixtures.pageserver.types import ( - DeltaLayerFileName, - ImageLayerFileName, + DeltaLayerName, + ImageLayerName, is_future_layer, ) from fixtures.pageserver.utils import ( @@ -81,7 +81,7 @@ def get_future_layers(): current = get_index_part() assert len(set(current.layer_metadata.keys())) == 1 layer_file_name = list(current.layer_metadata.keys())[0] - assert isinstance(layer_file_name, DeltaLayerFileName) + assert isinstance(layer_file_name, DeltaLayerName) assert layer_file_name.is_l0(), f"{layer_file_name}" log.info("force image layer creation in the future by writing some data into in-memory layer") @@ -146,7 +146,7 @@ def get_future_layers(): future_layers = get_future_layers() assert len(future_layers) == 1 future_layer = future_layers[0] - assert isinstance(future_layer, ImageLayerFileName) + assert isinstance(future_layer, ImageLayerName) assert future_layer.lsn == last_record_lsn log.info( f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}" diff --git a/test_runner/regress/test_logical_replication.py 
b/test_runner/regress/test_logical_replication.py index 1bac528397d7..57d3447cae7b 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -6,7 +6,9 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + AuxFileStore, NeonEnv, + NeonEnvBuilder, logical_replication_sync, wait_for_last_flush_lsn, ) @@ -18,6 +20,19 @@ def random_string(n: int): return "".join([choice(ascii_lowercase) for _ in range(n)]) +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.V2, AuxFileStore.CrossValidation] +) +def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore): + env = neon_simple_env + with env.pageserver.http_client() as client: + tenant_config = client.tenant_config(env.initial_tenant).effective_config + assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"] + + +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -159,6 +174,9 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): # Test that neon.logical_replication_max_snap_files works +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): def slot_removed(ep): assert ( @@ -203,8 +221,86 @@ def slot_removed(ep): wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) +# Tests that walsender correctly blocks until WAL is downloaded from safekeepers +def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("init") + endpoint = env.endpoints.create_start("init") + + with endpoint.connect().cursor() as cur: + cur.execute("create table wal_generator (id serial primary key, data text)") + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + cur.execute("create table t(a int)") + cur.execute("create publication pub for table t") + cur.execute("insert into t values (1)") + + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(a int)") + connstr = endpoint.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub") + logical_replication_sync(vanilla_pg, endpoint) + vanilla_pg.stop() + + # Pause the safekeepers so that they can't send WAL (except to pageserver) + for sk in env.safekeepers: + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-pause-send", "return")]) + + # Insert a 2 + with endpoint.connect().cursor() as cur: + cur.execute("insert into t values (2)") + + endpoint.stop_and_destroy() + + # This new endpoint should contain [1, 2], but it can't access WAL from safekeeper + endpoint = env.endpoints.create_start("init") + with endpoint.connect().cursor() as cur: + cur.execute("select * from t") + res = [r[0] for r in cur.fetchall()] + assert res == [1, 2] + + # Reconnect subscriber + vanilla_pg.start() + connstr = endpoint.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"alter subscription sub1 connection '{connstr}'") + + time.sleep(5) + # Make sure the 2 isn't replicated + 
assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1] + + # Re-enable WAL download + for sk in env.safekeepers: + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-pause-send", "off")]) + + logical_replication_sync(vanilla_pg, endpoint) + assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2] + + # Check that local reads also work + with endpoint.connect().cursor() as cur: + cur.execute("insert into t values (3)") + logical_replication_sync(vanilla_pg, endpoint) + assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2, 3] + + log_path = vanilla_pg.pgdatadir / "pg.log" + with open(log_path, "r") as log_file: + logs = log_file.read() + assert "could not receive data from WAL stream" not in logs + + # Test compute start at LSN page of which starts with contrecord # https://github.com/neondatabase/neon/issues/5749 +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -295,6 +391,9 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): # logical replication bug as such, but without logical replication, # records passed ot the WAL redo process are never large enough to hit # the bug. +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -366,6 +465,9 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_replication_shutdown(neon_simple_env: NeonEnv): # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed env = neon_simple_env diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 5c99ca6733ee..225622868dce 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -119,11 +119,11 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): cur = endpoint_main.connect().cursor() # Create table, and insert rows, each in a separate transaction - # Disable synchronous_commit to make this initialization go faster. + # Enable synchronous commit as we are timing sensitive # # Each row contains current insert LSN and the current timestamp, when # the row was inserted. 
- cur.execute("SET synchronous_commit=off") + cur.execute("SET synchronous_commit=on") cur.execute("CREATE TABLE foo (x integer)") tbl = [] for i in range(1000): @@ -132,7 +132,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=timezone.utc) after_lsn = query_scalar(cur, "SELECT pg_current_wal_lsn()") tbl.append([i, after_timestamp, after_lsn]) - time.sleep(0.005) + time.sleep(0.02) # Execute one more transaction with synchronous_commit enabled, to flush # all the previous transactions diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 81aed704bb95..bd7e4f118fed 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -2,6 +2,7 @@ from pathlib import Path from typing import Optional +import toml from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -12,10 +13,11 @@ from fixtures.utils import wait_until -# test that we cannot override node id after init -def test_pageserver_init_node_id( - neon_simple_env: NeonEnv, neon_binpath: Path, pg_distrib_dir: Path -): +def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path): + """ + NB: The neon_local doesn't use `--init` mode anymore, but our production + deployment still does => https://github.com/neondatabase/aws/pull/1322 + """ workdir = neon_simple_env.pageserver.workdir pageserver_config = workdir / "pageserver.toml" pageserver_bin = neon_binpath / "pageserver" @@ -29,18 +31,36 @@ def run_pageserver(args): stderr=subprocess.PIPE, ) - # remove initial config and stop existing pageserver - pageserver_config.unlink() neon_simple_env.pageserver.stop() - bad_init = run_pageserver(["--init", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']) + with open(neon_simple_env.pageserver.config_toml_path, "r") as f: + ps_config = toml.load(f) + + required_config_keys = [ + "pg_distrib_dir", + "listen_pg_addr", + "listen_http_addr", + "pg_auth_type", + "http_auth_type", + ] + required_config_overrides = [ + f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys + ] + + pageserver_config.unlink() + + bad_init = run_pageserver(["--init", *required_config_overrides]) assert ( bad_init.returncode == 1 ), "pageserver should not be able to init new config without the node id" assert 'missing config value "id"' in bad_init.stderr assert not pageserver_config.exists(), "config file should not be created after init error" - good_init_cmd = ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + good_init_cmd = [ + "--init", + f"--config-override=id={ps_config['id']}", + *required_config_overrides, + ] completed_init = run_pageserver(good_init_cmd) assert ( completed_init.returncode == 0 @@ -49,11 +69,7 @@ def run_pageserver(args): bad_reinit = run_pageserver(good_init_cmd) assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists" - assert "already exists, cannot init it" in bad_reinit.stderr - - bad_update = run_pageserver(["--update-config", "-c", "id = 3"]) - assert bad_update.returncode == 1, "pageserver should not allow updating node id" - assert "has node id already, it cannot be overridden" in bad_update.stderr + assert "config file already exists" in bad_reinit.stderr def check_client(env: NeonEnv, client: PageserverHttpClient): diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 
67f68a62aff8..58eaf404d3f7 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -10,6 +10,7 @@ """ import enum +import os import re import time from typing import Optional @@ -220,7 +221,12 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # We will start a pageserver with no control_plane_api set, so it won't be able to self-register env.storage_controller.node_register(env.pageserver) - env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) + replaced_config = env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_api": "", + } + ) + env.pageserver.start() env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) env.neon_cli.create_tenant( @@ -251,8 +257,8 @@ def parse_generation_suffix(key): assert parse_generation_suffix(key) is None env.pageserver.stop() - # Starting without the override that disabled control_plane_api + env.pageserver.patch_config_toml_nonrecursive(replaced_config) env.pageserver.start() generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) @@ -525,9 +531,12 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # incident, but it might be unavoidable: if so, we want to be able to start up # and serve clients. env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP - env.pageserver.start( - overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), + replaced = env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_emergency_mode": True, + } ) + env.pageserver.start() # The pageserver should provide service to clients generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) @@ -549,6 +558,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # The pageserver should work fine when subsequently restarted in non-emergency mode env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP + env.pageserver.patch_config_toml_nonrecursive(replaced) env.pageserver.start() generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) @@ -691,3 +701,50 @@ def test_multi_attach( # All data we wrote while multi-attached remains readable workload.validate(pageservers[2].id) + + +@pytest.mark.skip(reason="To be enabled after release with new local path style") +def test_upgrade_generationless_local_file_paths( + neon_env_builder: NeonEnvBuilder, +): + """ + Test pageserver behavior when starting up with local layer paths without + generation numbers: it should accept these layer files, and avoid doing + a delete/download cycle on them.
+ """ + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(1000) + + env.pageserver.stop() + + # Rename the local paths to legacy format, to simulate what + # we would see when upgrading + timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) + files_renamed = 0 + for filename in os.listdir(timeline_dir): + path = os.path.join(timeline_dir, filename) + log.info(f"Found file {path}") + if path.endswith("-v1-00000001"): + new_path = path[:-12] + os.rename(path, new_path) + log.info(f"Renamed {path} -> {new_path}") + files_renamed += 1 + + assert files_renamed > 0 + + env.pageserver.start() + + workload.validate() + + # Assert that there were no on-demand downloads + assert ( + env.pageserver.http_client().get_metric_value( + "pageserver_remote_ondemand_downloaded_layers_total" + ) + == 0 + ) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 753898f747b4..759e845927d4 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -20,7 +20,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): endpoint = env.endpoints.create_start("main") pageserver_http = env.pageserver.http_client() - assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) + == 1 + ) pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -55,7 +58,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): env.pageserver.start() # We reloaded our tenant - assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) + == 1 + ) cur.execute("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8f194e5ddad9..c40bb962f251 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -2,12 +2,12 @@ import os import random import time -from pathlib import Path from typing import Any, Dict, Optional import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, poll_for_remote_storage_iterations, @@ -51,9 +51,13 @@ def evict_random_layers( if "ephemeral" in layer.name or "temp_download" in layer.name: continue + layer_name = parse_layer_file_name(layer.name) + if rng.choice([True, False]): - log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}") - client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name) + log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer_name.to_str()}") + client.evict_layer( + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer_name.to_str() + ) @pytest.mark.parametrize("seed", [1, 2, 3]) @@ -402,32 +406,6 @@ def validate_heatmap(heatmap): validate_heatmap(heatmap_second) -def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: - """ - Inspect local storage on a pageserver to 
discover which layer files are present. - - :return: list of relative paths to layers, from the timeline root. - """ - timeline_path = pageserver.timeline_dir(tenant_id, timeline_id) - - def relative(p: Path) -> Path: - return p.relative_to(timeline_path) - - return sorted( - list( - map( - relative, - filter( - lambda path: path.name != "metadata" - and "ephemeral" not in path.name - and "temp" not in path.name, - timeline_path.glob("*"), - ), - ) - ) - ) - - def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): """ Test the overall data flow in secondary mode: @@ -482,8 +460,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id ) # Make changes on attached pageserver, check secondary downloads them @@ -500,8 +478,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) try: - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id ) except: # Do a full listing of the secondary location on errors, to help debug of @@ -523,8 +501,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # ================================================================== try: log.info("Evicting a layer...") - layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] - some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1] + layer_to_evict = ps_attached.list_layers(tenant_id, timeline_id)[0] + some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1] log.info(f"Victim layer: {layer_to_evict.name}") ps_attached.http_client().evict_layer( tenant_id, timeline_id, layer_name=layer_to_evict.name @@ -537,13 +515,13 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"] ) assert layer_to_evict.name not in heatmap_layers - assert some_other_layer.name in heatmap_layers + assert parse_layer_file_name(some_other_layer.name).to_str() in heatmap_layers ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id + assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id) + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id ) except: # On assertion failures, log some details to help with debugging @@ -630,7 +608,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): for timeline_id in timelines: log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}") # One or more layers should be present for all timelines - assert list_layers(ps_secondary, tenant_id, timeline_id) + assert ps_secondary.list_layers(tenant_id, timeline_id) # Delete the second timeline: this should be reflected later on the secondary env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) @@ -645,10 +623,10 @@ def 
test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary = next(p for p in env.pageservers if p != ps_attached) # This one was not deleted - assert list_layers(ps_secondary, tenant_id, timelines[0]) + assert ps_secondary.list_layers(tenant_id, timelines[0]) # This one was deleted - assert not list_layers(ps_secondary, tenant_id, timelines[1]) + assert not ps_secondary.list_layers(tenant_id, timelines[1]) t_end = time.time() @@ -708,7 +686,7 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) # Expect lots of layers - assert len(list_layers(ps_attached, tenant_id, timeline_id)) > 10 + assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 # Simulate large data by making layer downloads artifically slow for ps in env.pageservers: diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py new file mode 100644 index 000000000000..03e8c7c0dfd0 --- /dev/null +++ b/test_runner/regress/test_postgres_version.py @@ -0,0 +1,35 @@ +import json +import re +from pathlib import Path + +from fixtures.neon_fixtures import PgBin +from fixtures.pg_version import PgVersion + + +def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion): + """Test that Postgres version matches the one we expect""" + + with (base_dir / "vendor" / "revisions.json").open() as f: + expected_revisions = json.load(f) + + output_prefix = pg_bin.run_capture(["postgres", "--version"], with_command_header=False) + stdout = Path(f"{output_prefix}.stdout") + assert stdout.exists(), "postgres --version didn't print anything to stdout" + + with stdout.open() as f: + output = f.read().strip() + + # `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)". + pattern = r"postgres \(PostgreSQL\) (?P\d+\.\d+) \((?P[0-9a-f]{40})\)" + match = re.search(pattern, output, re.IGNORECASE) + assert match is not None, f"Can't parse {output} with {pattern}" + + version = match.group("version") + commit = match.group("commit") + + assert ( + pg_version.v_prefixed in expected_revisions + ), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" + + msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" + assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 47200a856e0d..70c025c225de 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -1,6 +1,3 @@ -# It's possible to run any regular test with the local fs remote storage via -# env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... 
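A side note on the layer-name changes in the secondary and remote-storage tests around here: they switch from comparing raw on-disk file names to parse_layer_file_name(...).to_str(), because local layer files may carry a generation suffix (the "-v1-00000001" ending that test_upgrade_generationless_local_file_paths strips above) while heatmap and index_part entries use the canonical name. The sketch below illustrates only that normalization idea; the suffix pattern ("-v1-" plus eight hex digits) and the helper name are assumptions inferred from this diff, not the fixture's real parser.

import re

# Assumed suffix shape, inferred from the "-v1-00000001" rename earlier in this diff.
GENERATION_SUFFIX = re.compile(r"-v1-[0-9a-f]{8}$")

def canonical_layer_name(file_name: str) -> str:
    """Strip a trailing generation suffix, if any, from a local layer file name."""
    return GENERATION_SUFFIX.sub("", file_name)

# Placeholder names, purely for illustration:
assert canonical_layer_name("some-layer-file-v1-00000001") == "some-layer-file"
assert canonical_layer_name("some-layer-file") == "some-layer-file"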
- import os import queue import shutil @@ -15,6 +12,7 @@ wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, @@ -832,8 +830,9 @@ def test_compaction_waits_for_upload( assert len(upload_stuck_layers) > 0 for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert path.exists(), "while uploads are stuck the layers should be present on disk" + assert env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) + ), "while uploads are stuck the layers should be present on disk" # now this will do the L0 => L1 compaction and want to remove # upload_stuck_layers and the original initdb L0 @@ -841,8 +840,9 @@ def test_compaction_waits_for_upload( # as uploads are paused, the upload_stuck_layers should still be with us for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert path.exists(), "uploads are stuck still over compaction" + assert env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) + ), "uploads are stuck still over compaction" compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name() overlap = compacted_layers.intersection(upload_stuck_layers) @@ -876,9 +876,8 @@ def until_layer_deletes_completed(): wait_until(10, 1, until_layer_deletes_completed) for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert ( - not path.exists() + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) ), "l0 should now be removed because of L0 => L1 compaction and completed uploads" # We should not have hit the error handling path in uploads where a uploaded file is gone diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 611bd1c2a280..9227836862d3 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -47,7 +47,7 @@ def test_tenant_s3_restore( tenant_id = env.initial_tenant # Default tenant and the one we created - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 # create two timelines one being the parent of another, both with non-trivial data parent = None @@ -72,13 +72,13 @@ def test_tenant_s3_restore( time.sleep(4) assert ( - ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 ), "tenant removed before we deletion was issued" iterations = poll_for_remote_storage_iterations(remote_storage_kind) tenant_delete_wait_completed(ps_http, tenant_id, iterations) ps_http.deletion_queue_flush(execute=True) assert ( - ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 ), "tenant removed before we deletion was issued" env.storage_controller.attach_hook_drop(tenant_id) @@ -116,4 +116,4 @@ def test_tenant_s3_restore( # There might be some activity that advances the lsn so we can't use a strict equality check assert last_flush_lsn >= expected_last_flush_lsn, "last_flush_lsn too old" - assert 
ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 258377f8a299..d33803250f2d 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -177,6 +177,67 @@ def test_sharding_split_unsharded( env.storage_controller.consistency_check() +def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): + """ + Test that after a split, we clean up parent layer data in the child shards via compaction. + """ + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "3600s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + } + + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Check that we created with an unsharded TenantShardId: this is the default, + # but check it in case we change the default in future + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None + + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + workload.validate() + workload.stop() + + # Split one shard into two + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) + + # Check we got the shard IDs we expected + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None + assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None + + workload.validate() + workload.stop() + + env.storage_controller.consistency_check() + + # Cleanup part 1: while layers are still in PITR window, we should only drop layers that are fully redundant + for shard in shards: + ps = env.get_tenant_pageserver(shard) + + # Invoke compaction: this should drop any layers that don't overlap with the shard's key stripes + detail_before = ps.http_client().timeline_detail(shard, timeline_id) + ps.http_client().timeline_compact(shard, timeline_id) + detail_after = ps.http_client().timeline_detail(shard, timeline_id) + + # Physical size should shrink because some layers have been dropped + assert detail_after["current_physical_size"] < detail_before["current_physical_size"] + + # Compaction shouldn't make anything unreadable + workload.validate() + + def test_sharding_split_smoke( neon_env_builder: NeonEnvBuilder, ): diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index fdcb4cf9a4a0..bdd356388fe7 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -290,9 +290,12 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up # This is the pageserver where we'll initially create the tenant. Run it in emergency # mode so that it doesn't talk to storage controller, and do not register it. 
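A side note on the patch_config_toml_nonrecursive calls used in the pageserver-generations and emergency-mode hunks above, and in the storage-controller hunk that follows: they replace the old --pageserver-config-override=... start-time flags by editing pageserver.toml directly and handing back the previous values so a later call can restore them. The sketch below is a rough illustration of that contract, assuming a plain toml round-trip; it is not the fixture's actual implementation, and the function name is deliberately different.

from pathlib import Path
from typing import Any, Dict

import toml  # the same library the test_pageserver_api.py hunk above starts importing

def patch_toml_top_level(config_path: Path, patch: Dict[str, Any]) -> Dict[str, Any]:
    """Overwrite top-level keys in a TOML file and return the replaced values."""
    config = toml.load(config_path)
    # Non-recursive: nested tables are replaced wholesale, not merged.
    replaced = {key: config.get(key) for key in patch}
    config.update(patch)
    config_path.write_text(toml.dumps(config))
    # Assumes the keys already existed; restoring keys that were absent would need a delete step.
    return replaced

# Hypothetical usage mirroring the tests:
#   replaced = patch_toml_top_level(config_path, {"control_plane_emergency_mode": True})
#   ... restart the pageserver, run the scenario ...
#   patch_toml_top_level(config_path, replaced)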
env.pageservers[0].allowed_errors.append(".*Emergency mode!.*") - env.pageservers[0].start( - overrides=("--pageserver-config-override=control_plane_emergency_mode=true",), + env.pageservers[0].patch_config_toml_nonrecursive( + { + "control_plane_emergency_mode": True, + } ) + env.pageservers[0].start() origin_ps = env.pageservers[0] # These are the pageservers managed by the sharding service, where the tenant diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c115c0375b9d..363c3c88ec71 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -64,7 +64,7 @@ def test_tenant_delete_smoke( ) # Default tenant and the one we created - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 # create two timelines one being the parent of another parent = None @@ -90,9 +90,9 @@ def test_tenant_delete_smoke( iterations = poll_for_remote_storage_iterations(remote_storage_kind) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 tenant_delete_wait_completed(ps_http, tenant_id, iterations) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 tenant_path = env.pageserver.tenant_dir(tenant_id) assert not tenant_path.exists() @@ -108,7 +108,7 @@ def test_tenant_delete_smoke( ) # Deletion updates the tenant count: the one default tenant remains - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 class Check(enum.Enum): @@ -532,7 +532,9 @@ def hit_run_failpoint(): # The TenantSlot is still present while the original request is hung before # final removal - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 + ) # Permit the original request to run to success ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off")) @@ -556,7 +558,8 @@ def hit_run_failpoint(): ) # Zero tenants remain (we deleted the default tenant) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0 def test_tenant_delete_races_timeline_creation( @@ -673,7 +676,7 @@ def deletion_arrived(): ) # Zero tenants remain (we deleted the default tenant) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 53da5485248a..e73eae91f004 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -668,9 +668,9 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): client.configure_failpoints((failpoint, "off")) - with pytest.raises( - PageserverApiException, match="Failed to refresh gc_info before gathering 
inputs" - ): + # accept both, because the deletion might still complete before + matcher = "(Failed to refresh gc_info before gathering inputs|NotFound: tenant)" + with pytest.raises(PageserverApiException, match=matcher): completion.result() # this happens on both cases diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index d16978d02ac0..a1e96928bfd8 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -18,6 +18,7 @@ NeonEnvBuilder, last_flush_lsn_upload, ) +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, @@ -246,7 +247,10 @@ def test_tenant_redownloads_truncated_file_on_startup( # ensure the same size is found from the index_part.json index_part = env.pageserver_remote_storage.index_content(tenant_id, timeline_id) - assert index_part["layer_metadata"][path.name]["file_size"] == expected_size + assert ( + index_part["layer_metadata"][parse_layer_file_name(path.name).to_str()]["file_size"] + == expected_size + ) ## Start the pageserver. It will notice that the file size doesn't match, and ## rename away the local file. It will be re-downloaded when it's needed. @@ -276,7 +280,7 @@ def test_tenant_redownloads_truncated_file_on_startup( # the remote side of local_layer_truncated remote_layer_path = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, path.name + tenant_id, timeline_id, parse_layer_file_name(path.name).to_str() ) # if the upload ever was ongoing, this check would be racy, but at least one diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py new file mode 100644 index 000000000000..075f0a6bbce9 --- /dev/null +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -0,0 +1,402 @@ +import datetime +import enum +from concurrent.futures import ThreadPoolExecutor +from queue import Empty, Queue +from threading import Barrier +from typing import List + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import HistoricLayerInfo +from fixtures.pageserver.utils import wait_timeline_detail_404 +from fixtures.remote_storage import LocalFsStorage +from fixtures.types import Lsn, TimelineId + + +def by_end_lsn(info: HistoricLayerInfo) -> Lsn: + assert info.lsn_end is not None + return Lsn(info.lsn_end) + + +def layer_name(info: HistoricLayerInfo) -> str: + return info.layer_file_name + + +@enum.unique +class Branchpoint(str, enum.Enum): + """ + Have branches at these Lsns possibly relative to L0 layer boundary. 
+ """ + + EARLIER = "earlier" + AT_L0 = "at" + AFTER_L0 = "after" + LAST_RECORD_LSN = "head" + + def __str__(self) -> str: + return self.value + + @staticmethod + def all() -> List["Branchpoint"]: + return [ + Branchpoint.EARLIER, + Branchpoint.AT_L0, + Branchpoint.AFTER_L0, + Branchpoint.LAST_RECORD_LSN, + ] + + +SHUTDOWN_ALLOWED_ERRORS = [ + ".*initial size calculation failed: downloading failed, possibly for shutdown", + ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", +] + + +@pytest.mark.parametrize("branchpoint", Branchpoint.all()) +@pytest.mark.parametrize("restart_after", [True, False]) +@pytest.mark.parametrize("write_to_branch_first", [True, False]) +def test_ancestor_detach_branched_from( + neon_env_builder: NeonEnvBuilder, + branchpoint: Branchpoint, + restart_after: bool, + write_to_branch_first: bool, +): + """ + Creates a branch relative to L0 lsn boundary according to Branchpoint. Later the timeline is detached. + """ + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE TABLE foo (i BIGINT);") + + after_first_tx = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + + # create a single layer for us to remote copy + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + deltas = client.layer_map_info(env.initial_tenant, env.initial_timeline).delta_layers() + # there is also the in-mem layer, but ignore it for now + assert len(deltas) == 2, "expecting there to be two deltas: initdb and checkpointed" + later_delta = max(deltas, key=by_end_lsn) + assert later_delta.lsn_end is not None + + # -1 as the lsn_end is exclusive. 
+ last_lsn = Lsn(later_delta.lsn_end).lsn_int - 1 + + if branchpoint == Branchpoint.EARLIER: + branch_at = after_first_tx + rows = 0 + truncated_layers = 1 + elif branchpoint == Branchpoint.AT_L0: + branch_at = Lsn(last_lsn) + rows = 8192 + truncated_layers = 0 + elif branchpoint == Branchpoint.AFTER_L0: + branch_at = Lsn(last_lsn + 8) + rows = 8192 + # as there is no 8 byte walrecord, nothing should get copied from the straddling layer + truncated_layers = 0 + else: + # this case also covers the implicit flush of ancestor as the inmemory hasn't been flushed yet + assert branchpoint == Branchpoint.LAST_RECORD_LSN + branch_at = None + rows = 16384 + truncated_layers = 0 + + name = "new main" + + timeline_id = env.neon_cli.create_branch( + name, "main", env.initial_tenant, ancestor_start_lsn=branch_at + ) + + recorded = Lsn(client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_lsn"]) + if branch_at is None: + # fix it up if we need it later (currently unused) + branch_at = recorded + else: + assert branch_at == recorded, "the test should not use unaligned lsns" + + if write_to_branch_first: + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + # make sure the ep is writable + # with BEFORE_L0, AFTER_L0 there will be a gap in Lsns caused by accurate end_lsn on straddling layers + ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id) + + # branch must have a flush for "PREV_LSN: none" + client.timeline_checkpoint(env.initial_tenant, timeline_id) + branch_layers = set( + map(layer_name, client.layer_map_info(env.initial_tenant, timeline_id).historic_layers) + ) + else: + branch_layers = set() + + all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert all_reparented == set() + + if restart_after: + env.pageserver.stop() + env.pageserver.start() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 16384 + + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + + old_main_info = client.layer_map_info(env.initial_tenant, env.initial_timeline) + old_main = set(map(layer_name, old_main_info.historic_layers)) + + new_main_info = client.layer_map_info(env.initial_tenant, timeline_id) + new_main = set(map(layer_name, new_main_info.historic_layers)) + + new_main_copied_or_truncated = new_main - branch_layers + new_main_truncated = new_main_copied_or_truncated - old_main + + assert len(new_main_truncated) == truncated_layers + # could additionally check that the symmetric difference has layers starting at the same lsn + # but if nothing was copied, then there is no nice rule. + # there could be a hole in LSNs between copied from the "old main" and the first branch layer. 
+ + client.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + + +def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): + """ + The case from RFC: + + +-> another branch with same ancestor_lsn as new main + | + old main -------|---------X---------> + | | | + | | +-> after + | | + | +-> new main + | + +-> reparented + + Ends up as: + + old main ---------------------------> + | + +-> after + + +-> another branch with same ancestor_lsn as new main + | + new main -------|---------|-> + | + +-> reparented + + We confirm the end result by being able to delete "old main" after deleting "after". + """ + + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE TABLE foo (i BIGINT);") + ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;") + + branchpoint_pipe = wait_for_last_flush_lsn( + env, ep, env.initial_tenant, env.initial_timeline + ) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + + branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + # as this only gets reparented, we don't need to write to it like new main + reparented = env.neon_cli.create_branch( + "reparented", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_pipe + ) + + same_branchpoint = env.neon_cli.create_branch( + "same_branchpoint", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x + ) + + timeline_id = env.neon_cli.create_branch( + "new main", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x + ) + + after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) + + all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert all_reparented == {reparented, same_branchpoint} + + env.pageserver.quiesce_tenants() + + # checking the ancestor after is much faster than waiting for the endpoint not start + expected_result = [ + ("main", env.initial_timeline, None, 16384, 1), + ("after", after, env.initial_timeline, 16384, 1), + ("new main", timeline_id, None, 8192, 1), + ("same_branchpoint", same_branchpoint, timeline_id, 8192, 1), + ("reparented", reparented, timeline_id, 0, 1), + ] + + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + for _, queried_timeline, expected_ancestor, _, _ in expected_result: + details = client.timeline_detail(env.initial_tenant, queried_timeline) + ancestor_timeline_id = details["ancestor_timeline_id"] + if expected_ancestor is None: + assert ancestor_timeline_id is None + else: + assert TimelineId(ancestor_timeline_id) == expected_ancestor + + index_part = env.pageserver_remote_storage.index_content( + env.initial_tenant, queried_timeline + ) + lineage = index_part["lineage"] + assert lineage is not None + + assert lineage.get("reparenting_history_overflown", "false") == "false" + + if queried_timeline == timeline_id: + original_ancestor = lineage["original_ancestor"] + assert original_ancestor is not None + assert original_ancestor[0] == str(env.initial_timeline) + assert 
original_ancestor[1] == str(branchpoint_x) + + # this does not contain Z in the end, so fromisoformat accepts it + # it is to be in line with the deletion timestamp.. well, almost. + when = original_ancestor[2][:26] + when_ts = datetime.datetime.fromisoformat(when) + assert when_ts < datetime.datetime.now() + assert len(lineage.get("reparenting_history", [])) == 0 + elif expected_ancestor == timeline_id: + assert len(lineage.get("original_ancestor", [])) == 0 + assert lineage["reparenting_history"] == [str(env.initial_timeline)] + else: + assert len(lineage.get("original_ancestor", [])) == 0 + assert len(lineage.get("reparenting_history", [])) == 0 + + for name, _, _, rows, starts in expected_result: + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1 + + # delete the timelines to confirm detach actually worked + client.timeline_delete(env.initial_tenant, after) + wait_timeline_detail_404(client, env.initial_tenant, after, 10, 1.0) + + client.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + + +def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): + """ + Makes sure that the timeline is able to receive writes through-out the detach process. + """ + + env = neon_env_builder.init_start() + + client = env.pageserver.http_client() + + # row counts have been manually verified to cause reconnections and getpage + # requests when restart_after=False with pg16 + def insert_rows(n: int, ep) -> int: + ep.safe_psql( + f"INSERT INTO foo SELECT i::bigint, 'more info!! this is a long string' || i FROM generate_series(0, {n - 1}) g(i);" + ) + return n + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE EXTENSION neon_test_utils;") + ep.safe_psql("CREATE TABLE foo (i BIGINT, aux TEXT NOT NULL);") + + rows = insert_rows(256, ep) + + branchpoint = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + timeline_id = env.neon_cli.create_branch( + "new main", "main", tenant_id=env.initial_tenant, ancestor_start_lsn=branchpoint + ) + + log.info("starting the new main endpoint") + ep = env.endpoints.create_start("new main", tenant_id=env.initial_tenant) + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + + def small_txs(ep, queue: Queue[str], barrier): + extra_rows = 0 + + with ep.connect() as conn: + while True: + try: + queue.get_nowait() + break + except Empty: + pass + + if barrier is not None: + barrier.wait() + barrier = None + + cursor = conn.cursor() + cursor.execute( + "INSERT INTO foo(i, aux) VALUES (1, 'more info!! 
this is a long string' || 1);" + " ) + extra_rows += 1 + return extra_rows + + with ThreadPoolExecutor(max_workers=1) as exec: + queue: Queue[str] = Queue() + barrier = Barrier(2) + + completion = exec.submit(small_txs, ep, queue, barrier) + barrier.wait() + + reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert len(reparented) == 0 + + env.pageserver.quiesce_tenants() + + queue.put("done") + extra_rows = completion.result() + assert extra_rows > 0, "some rows should have been written" + rows += extra_rows + + assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None + + assert ep.safe_psql("SELECT clear_buffer_cache();") + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0 + ep.stop() + + # finally restart the endpoint and make sure we still have the same answer + with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + +# TODO: +# - after starting the operation, tenant is deleted +# - after starting the operation, pageserver is shutdown, restarted +# - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited +# - deletion of reparented while reparenting should fail once, then succeed (?) +# - branch near existing L1 boundary, image layers? +# - investigate: why are layers started at uneven lsn? not just after branching, but in general. diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 06f2a8befda5..b549db1af637 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -168,15 +168,16 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): # The VM page in shared buffer cache, and the same page as reconstructed # by the pageserver, should be equal. # - # Ignore the LSN on the page though (first 8 bytes). If the dirty - # VM page is flushed from the cache for some reason, it gets WAL-logged, - # which changes the LSN on the page. + # Ignore page header (24 bytes) of visibility map. + # If the dirty VM page is flushed from the cache for some reason, + # it gets WAL-logged, which changes the LSN on the page. + # Also, in the neon SMGR we can replace an empty heap page with a zero (uninitialized) heap page.
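A quick aside on the test_vm_bits.py hunk that continues below: the compared slice widens from [8:100] to [24:100] because 8 bytes only covers pd_lsn, while the full PostgreSQL PageHeaderData is 24 bytes (pd_lsn 8, pd_checksum 2, pd_flags 2, pd_lower 2, pd_upper 2, pd_special 2, pd_pagesize_version 2, pd_prune_xid 4), so skipping 24 bytes ignores every header field that may legitimately differ between the cached and reconstructed page. A throwaway helper, purely illustrative:

PAGE_HEADER_SIZE = 24  # sizeof(PageHeaderData) in PostgreSQL

def pages_equal_ignoring_header(page_a: bytes, page_b: bytes, upto: int = 100) -> bool:
    """Compare two raw page images while skipping the 24-byte page header."""
    return page_a[PAGE_HEADER_SIZE:upto] == page_b[PAGE_HEADER_SIZE:upto]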
cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") - vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() + vm_page_in_cache = (cur.fetchall()[0][0])[24:100].hex() cur.execute( "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" ) - vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() + vm_page_at_pageserver = (cur.fetchall()[0][0])[24:100].hex() assert vm_page_at_pageserver == vm_page_in_cache diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 5902eb321795..dce5616ac69e 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -254,7 +254,9 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): ) -def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): +def endpoint_create_start( + env: NeonEnv, branch: str, pgdir_name: Optional[str], allow_multiple: bool = False +): endpoint = Endpoint( env, tenant_id=env.initial_tenant, @@ -268,14 +270,23 @@ def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): # embed current time in endpoint ID endpoint_id = pgdir_name or f"ep-{time.time()}" return endpoint.create_start( - branch_name=branch, endpoint_id=endpoint_id, config_lines=["log_statement=all"] + branch_name=branch, + endpoint_id=endpoint_id, + config_lines=["log_statement=all"], + allow_multiple=allow_multiple, ) async def exec_compute_query( - env: NeonEnv, branch: str, query: str, pgdir_name: Optional[str] = None + env: NeonEnv, + branch: str, + query: str, + pgdir_name: Optional[str] = None, + allow_multiple: bool = False, ): - with endpoint_create_start(env, branch=branch, pgdir_name=pgdir_name) as endpoint: + with endpoint_create_start( + env, branch=branch, pgdir_name=pgdir_name, allow_multiple=allow_multiple + ) as endpoint: before_conn = time.time() conn = await endpoint.connect_async() res = await conn.fetch(query) @@ -347,6 +358,7 @@ async def run(self): self.branch, f"INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key", pgdir_name=f"bgcompute{self.index}_key{verify_key}", + allow_multiple=True, ) log.info(f"result: {res}") if len(res) != 1: diff --git a/vendor/revisions.json b/vendor/revisions.json index a353fde8fdf6..c5b55762fa53 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "8ef3c33aa01631e17cb24a122776349fcc777b46", - "postgres-v15": "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a", - "postgres-v14": "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a" + "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"], + "v15": ["15.6", "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"], + "v14": ["14.11", "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a"] } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 41ca16f16b20..e9d983eba3b6 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -244,6 +244,55 @@ files: values: [approximate_working_set_size] query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; + + - metric_name: current_lsn + type: gauge + help: 'Current LSN of the database' + key_labels: + values: [lsn] + query: | + select + case + when pg_catalog.pg_is_in_recovery() + then pg_last_wal_replay_lsn() + else pg_current_wal_lsn() + end as lsn; + + - metric_name: replication_delay_bytes + type: gauge + help: 'Bytes between received and replayed LSN' + key_labels: + values: [replication_delay_bytes] + query: | + 
SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes; + + - metric_name: replication_delay_seconds + type: gauge + help: 'Time since last LSN was replayed' + key_labels: + values: [replication_delay_seconds] + query: | + SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; + + - metric_name: checkpoints_req + type: gauge + help: 'Number of requested checkpoints' + key_labels: + values: [checkpoints_req] + query: | + SELECT checkpoints_req FROM pg_stat_bgwriter; + + - metric_name: checkpoints_timed + type: gauge + help: 'Number of scheduled checkpoints' + key_labels: + values: [checkpoints_timed] + query: | + SELECT checkpoints_timed FROM pg_stat_bgwriter; - filename: neon_collector_autoscaling.yml content: | collector_name: neon_collector_autoscaling @@ -295,7 +344,6 @@ files: values: [approximate_working_set_size] query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; - build: | # Build cgroup-tools # diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index b2da33e44a28..b605757f64c4 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,7 +19,7 @@ aws-runtime = { version = "1", default-features = false, features = ["event-stre aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] } aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] } aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] } -aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio", "test-util"] } +aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "http-body-1-x", "rt-tokio", "test-util"] } axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] }
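Finally, a side note on the repeated get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) changes in the test files earlier in this diff: the metric is evidently now exported per mode (the tenant-delete hunk also checks {"mode": "inprogress"}), so lookups have to name the labelled series they mean. Below is a rough, stdlib-only illustration of picking a labelled sample out of Prometheus text exposition format; it mirrors the call shape used by the tests but is not the fixture's real implementation, and the function name here is invented.

import re
from typing import Dict, Optional

SAMPLE_RE = re.compile(r'^(?P<name>[A-Za-z_:][A-Za-z0-9_:]*)(?:\{(?P<labels>[^}]*)\})?\s+(?P<value>\S+)$')
LABEL_RE = re.compile(r'(\w+)="([^"]*)"')

def lookup_metric_value(text: str, name: str, labels: Optional[Dict[str, str]] = None) -> Optional[float]:
    """Return the first sample of `name` whose labels contain `labels` as a subset."""
    wanted = labels or {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        m = SAMPLE_RE.match(line)
        if m is None or m.group("name") != name:
            continue
        sample_labels = dict(LABEL_RE.findall(m.group("labels") or ""))
        if all(sample_labels.get(k) == v for k, v in wanted.items()):
            return float(m.group("value"))
    return None

exposition = (
    'pageserver_tenant_manager_slots{mode="attached"} 1\n'
    'pageserver_tenant_manager_slots{mode="inprogress"} 0\n'
)
assert lookup_metric_value(exposition, "pageserver_tenant_manager_slots", {"mode": "attached"}) == 1.0
assert lookup_metric_value(exposition, "pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0.0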