From 96a0cb40b315e23504d5ba64ded36cceaafd7a12 Mon Sep 17 00:00:00 2001 From: Minghui Yang Date: Fri, 9 Aug 2024 23:59:11 +0000 Subject: [PATCH] [BACKPORT 2024.1][#23459] YSQL: Fix asan YbAdminSnapshotScheduleTest.SysCatalogRetention failure Summary: The test YbAdminSnapshotScheduleTest.SysCatalogRetention has been failing consistently on asan build. Example: ``` [m-3] F0805 18:54:37.246038 218452 operation_driver.cc:423] T 00000000000000000000000000000000 P 0deac4231641432685e6356797ea60b4 S RD-P Ts { days: 19940 time: 18:54:36.777167 } kSnapshot (0x000050e00004cf60): Apply failed: Not found (yb/master/restore_sys_catalog_state.cc:404): Restore sys catalog failed: Determine restoring entries failed: Not found restoring table: 000040000000300080000000000040c3, request: operation: RESTORE_SYS_CATALOG snapshot_id: "7FD5714E4B99435C8ED1CBDCA67C1A0B" snapshot_hybrid_time: 7056991983307026432 restoration_id: "C81E641E7EE74F7EAAD967F031B19689" ``` I found it is possible that `DeleteTableInternal` can be called more than once in DDL atomicity helper function `CatalogManager::YsqlDdlTxnDropTableHelper`. DDL atomicity handling is async and can be repeatedly triggered by client query whether the DDL txn is done. While re-processing a table should be fine because we expect duplicate processing should be idemponent so should be logically a no-op. `DeleteTableInternal` is an exception when PITR is involved: the first call to `DeleteTableInternal` results in the table gets HIDDEN rather than DELETED when there is a PITR schedule to retain the table. the second call to `DeleteTableInternal` results in the table gets DELETED and PITR schedule no longer retain it. As a result, DELETED tables are not restored later when a PITR restore operation is performed. To work around the problem, I made an exception for `DeleteTableInternal` to detect duplicate calls. Jira: DB-12379 Original commit: e6c2ee0532ab3dbb5ff8727300397c274841760b / D37221 Test Plan: (1) ./yb_build.sh asan --cxx-test tools_yb-admin-snapshot-schedule-test --gtest_filter YbAdminSnapshotScheduleTest.SysCatalogRetention --clang17 -n 20 --tp 1 (2) ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/0 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/1 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/2 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/3 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/4 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/5 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/6 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/7 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/8 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/9 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/10 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/11 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/12 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/13 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/14 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/15 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/16 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/17 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/18 --clang17 -n 20 --tp 1 ./yb_build.sh asan --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest/PgDdlAtomicityStressTest.StressTest/19 --clang17 -n 20 --tp 1 Reviewers: hsunder, fizaa Reviewed By: hsunder Subscribers: yql, ybase Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D37275 --- src/yb/master/catalog_manager.h | 4 ++++ src/yb/master/ysql_ddl_handler.cc | 26 ++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/yb/master/catalog_manager.h b/src/yb/master/catalog_manager.h index 66d7f30c6fe4..5a01ff955a88 100644 --- a/src/yb/master/catalog_manager.h +++ b/src/yb/master/catalog_manager.h @@ -3265,7 +3265,11 @@ class CatalogManager : public tserver::TabletPeerLookupIf, YsqlDdlVerificationState state; // The table info objects of the tables affected by this transaction. + // The processed_tables is used to avoid duplicated processing. Currently it is the set of + // tables for which delete table has already been called. For PITR, we cannot make duplicate + // delete table calls. std::vector> tables; + std::unordered_set processed_tables; }; // This map stores the transaction ids of all the DDL transactions undergoing verification. diff --git a/src/yb/master/ysql_ddl_handler.cc b/src/yb/master/ysql_ddl_handler.cc index 4120485e9e9a..baf3b1fac1a4 100644 --- a/src/yb/master/ysql_ddl_handler.cc +++ b/src/yb/master/ysql_ddl_handler.cc @@ -127,7 +127,7 @@ bool CatalogManager::CreateOrUpdateDdlTxnVerificationState( ysql_ddl_txn_verfication_state_map_.emplace(txn.transaction_id, YsqlDdlTransactionState{TxnState::kUnknown, YsqlDdlVerificationState::kDdlInProgress, - {table}}); + {table}, {} /* processed_tables */}); return true; } @@ -189,6 +189,7 @@ Status CatalogManager::YsqlDdlTxnCompleteCallback(const string& pb_txn_id, << txn << " is_committed: " << (is_committed ? "true" : "false"); vector tables; + std::unordered_set processed_tables; { LockGuard lock(ddl_txn_verifier_mutex_); auto verifier_state = FindOrNull(ysql_ddl_txn_verfication_state_map_, txn); @@ -214,7 +215,8 @@ Status CatalogManager::YsqlDdlTxnCompleteCallback(const string& pb_txn_id, // t3 gets added, if we return Status::OK() here, then t3 will never be // processed. Therefore we need to call YsqlDdlTxnCompleteCallbackInternal // to process t3. It is fine to reprocess t1 and t2, that will result in a - // no-op. + // no-op, except for delete table operation for which we'll detect and avoid + // reprocessing. SCHECK_EQ(txn_state, verifier_state->txn_state, IllegalState, Format("Mismatch in txn_state for transaction $0", txn)); } else { @@ -222,10 +224,15 @@ Status CatalogManager::YsqlDdlTxnCompleteCallback(const string& pb_txn_id, verifier_state->state = YsqlDdlVerificationState::kDdlPostProcessing; } tables = verifier_state->tables; + processed_tables = verifier_state->processed_tables; } bool ddl_verification_success = true; for (auto& table : tables) { + if (processed_tables.contains(table->id())) { + VLOG(1) << "DDL already processed on table " << table->id(); + continue; + } // If the table is already involved in a new DDL transaction, then txn // has already completed. The table will be taken care of by the new // transaction. @@ -473,6 +480,21 @@ Status CatalogManager::YsqlDdlTxnDropTableHelper( if (RandomActWithProbability(FLAGS_TEST_ysql_ddl_rollback_failure_probability)) { return STATUS(InternalError, "Injected random failure for testing."); } + // Mark that we have called delete table on this table. + { + LockGuard lock(ddl_txn_verifier_mutex_); + auto verifier_state = FindOrNull(ysql_ddl_txn_verfication_state_map_, txn_data.ddl_txn_id); + if (!verifier_state) { + VLOG(3) << "Transaction " << txn_data.ddl_txn_id << " is already verified, ignoring"; + return Status::OK(); + } + if (verifier_state->processed_tables.contains(table->id())) { + VLOG(1) << "Delete table already called on table " << table->id() + << " in txn " << txn_data.ddl_txn_id; + return Status::OK(); + } + verifier_state->processed_tables.insert(table->id()); + } return DeleteTableInternal(&dtreq, &dtresp, nullptr /* rpc */, txn_data.epoch); }