From 75e91684d434b9fea12d822032146e523beb62d0 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 10 Sep 2024 13:42:50 +0000 Subject: [PATCH] Test and fix recovery with snapshot without ledger --- src/host/ledger.h | 10 +++++++++- tests/recovery.py | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/src/host/ledger.h b/src/host/ledger.h index 37ecd504f3e8..f5fb4645c0f1 100644 --- a/src/host/ledger.h +++ b/src/host/ledger.h @@ -1392,8 +1392,16 @@ namespace asynchost LOG_DEBUG_FMT("Ledger truncate: {}/{}", idx, last_idx); - if (idx >= last_idx || idx < committed_idx) + // Conservative check to avoid truncating to future indices, or dropping + // committed entries. If the ledger is being initialised from a snapshot + // alone, the first truncation effectively sets the last index. + if (last_idx != 0 && (idx >= last_idx || idx < committed_idx)) { + LOG_DEBUG_FMT( + "Ignoring truncate to {} - last_idx: {}, committed_idx: {}", + idx, + last_idx, + committed_idx); return; } diff --git a/tests/recovery.py b/tests/recovery.py index 72040d543b8f..2f170262b493 100644 --- a/tests/recovery.py +++ b/tests/recovery.py @@ -45,7 +45,7 @@ def get_and_verify_historical_receipt(network, ref_msg): @reqs.description("Recover a service") @reqs.recover(number_txs=2) -def test_recover_service(network, args, from_snapshot=True): +def test_recover_service(network, args, from_snapshot=True, no_ledger=False): network.save_service_identity(args) old_primary, _ = network.find_primary() @@ -71,7 +71,11 @@ def test_recover_service(network, args, from_snapshot=True): watcher.wait_for_recovery() - current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger() + if no_ledger: + current_ledger_dir = None + committed_ledger_dirs = None + else: + current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger() with tempfile.NamedTemporaryFile(mode="w+") as node_data_tf: start_node_data = {"this is a": "recovery node"} @@ -116,7 +120,7 @@ def test_recover_service(network, args, from_snapshot=True): args.initial_service_cert_validity_days ) - new_nodes = recovered_network.find_primary_and_any_backup() + new_nodes = recovered_network.get_joined_nodes() for n in new_nodes: with n.client() as c: r = c.get("/node/service/previous_identity") @@ -811,6 +815,26 @@ def run(args): ) +def run_recover_snapshot_alone(args): + """ + Recover a service from a snapshot alone, without any ledger files from a previous service. + """ + txs = app.LoggingTxs("user0") + with infra.network.network( + args.nodes, + args.binary_dir, + args.debug_nodes, + args.perf_nodes, + pdb=args.pdb, + txs=txs, + ) as network: + network.start_and_open(args) + primary, _ = network.find_primary() + # Recover node solely from snapshot + test_recover_service(network, args, from_snapshot=True, no_ledger=True) + return network + + if __name__ == "__main__": def add(parser): @@ -860,4 +884,11 @@ def add(parser): snapshot_tx_interval=1000000, ) + cr.add( + "recovery_snapshot_alone", + run_recover_snapshot_alone, + package="samples/apps/logging/liblogging", + nodes=infra.e2e_args.min_nodes(cr.args, f=0), # 1 node suffices for recovery + ) + cr.run()