Skip to content

Commit

Permalink
[#21789] docdb: Add tablet splitting support for clone
Browse files Browse the repository at this point in the history
Summary:
This diff adds tablet splitting support for clone. Specifically:
- `CatalogManager::GetBackupEntriesAsOfTime` now returns the set of active tablets (tablets that have not been split) instead of RUNNING and not-hidden tablets. Previously, we could return both the split parent and children until the split parent is deleted. This was causing a `ImportSnapshot` to fail because we were trying to create two tablets with the same partition start key (we do not have the split depth to disambiguate in `ImportSnapshot`).
  - This could fail to cover the space if just one tablet is registered. This gap will be fixed by D31428 (until then, we cannot clone to the time when only one tablet is registered)
- In `CloneStateManager::ScheduleCloneOps`, changed the check before cloning from active rocksdb from "tablet is hidden" to "tablet is hidden or split".
  - This covers the case where a split happens at time t1, and we clone to before t1 using a snapshot taken after the parent is hidden (so the snapshot does not contain the split parent)
Jira: DB-10680

Test Plan:
```
./yb_build.sh release --cxx-test integration-tests_minicluster-snapshot-test --gtest_filter PgCloneTest.TabletSplitting
./yb_build.sh release --cxx-test integration-tests_minicluster-snapshot-test --gtest_filter PgCloneTest.TabletSplittingWithIndex
```

Reviewers: mhaddad

Reviewed By: mhaddad

Subscribers: ybase

Differential Revision: https://phorge.dev.yugabyte.com/D36440
  • Loading branch information
SrivastavaAnubhav committed Jul 16, 2024
1 parent 835e30d commit 3273e9b
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 14 deletions.
128 changes: 124 additions & 4 deletions src/yb/integration-tests/minicluster-snapshot-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
#include "yb/yql/pgwrapper/pg_wrapper.h"
#include "yb/yql/pgwrapper/pg_mini_test_base.h"

DECLARE_int32(cleanup_split_tablets_interval_sec);
DECLARE_bool(enable_db_clone);
DECLARE_bool(master_auto_run_initdb);
DECLARE_int32(pgsql_proxy_webserver_port);
Expand All @@ -83,6 +84,7 @@ DECLARE_string(ysql_hba_conf_csv);
DECLARE_bool(TEST_fail_clone_pg_schema);
DECLARE_bool(TEST_fail_clone_tablets);
DECLARE_string(TEST_mini_cluster_pg_host_port);
DECLARE_bool(TEST_skip_deleting_split_tablets);

namespace yb {
namespace master {
Expand Down Expand Up @@ -544,16 +546,19 @@ TEST_P(MasterExportSnapshotTest, ExportSnapshotAsOfTimeWithHiddenTables) {
class PgCloneTest : public PostgresMiniClusterTest {
protected:
void SetUp() override {
ANNOTATE_UNPROTECTED_WRITE(FLAGS_cleanup_split_tablets_interval_sec) = 1;
PostgresMiniClusterTest::SetUp();
ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_db_clone) = true;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_mini_cluster_pg_host_port) = pg_host_port().ToString();
ASSERT_OK(CreateMasterBackupProxy());
ASSERT_OK(CreateProxies());
ASSERT_OK(CreateSourceDbAndSnapshotSchedule());
}

Status CreateMasterBackupProxy() {
Status CreateProxies() {
messenger_ = VERIFY_RESULT(rpc::MessengerBuilder("test-msgr").set_num_reactors(1).Build());
proxy_cache_ = std::make_unique<rpc::ProxyCache>(messenger_.get());
master_admin_proxy_ = std::make_unique<MasterAdminProxy>(
proxy_cache_.get(), mini_cluster()->mini_master()->bound_rpc_addr());
master_backup_proxy_ = std::make_shared<MasterBackupProxy>(
proxy_cache_.get(), mini_cluster()->mini_master()->bound_rpc_addr());
return Status::OK();
Expand All @@ -569,18 +574,40 @@ class PgCloneTest : public PostgresMiniClusterTest {
kTimeout));
RETURN_NOT_OK(WaitScheduleSnapshot(master_backup_proxy_.get(), schedule_id, kTimeout));
RETURN_NOT_OK(source_conn_->Execute("CREATE TABLE t1 (key INT PRIMARY KEY, value INT)"));
return Status::OK();
return Status::OK();
}

void DoTearDown() override {
messenger_->Shutdown();
PostgresMiniClusterTest::DoTearDown();
}

std::shared_ptr<master::MasterBackupProxy> master_backup_proxy() { return master_backup_proxy_; }
Result<TableInfoPtr> GetTable(const std::string& table_name, const std::string& db_name) {
auto leader_master = VERIFY_RESULT(cluster_->GetLeaderMiniMaster());
for (const auto& table : leader_master->catalog_manager_impl().GetTables(GetTablesMode::kAll)) {
if (table->name() == table_name && table->namespace_name() == db_name) {
return table;
}
}
return STATUS_FORMAT(NotFound, "Table $0 not found", table_name);
}

Status SplitTablet(const TabletId& tablet_id) {
SplitTabletRequestPB req;
SplitTabletResponsePB resp;
rpc::RpcController controller;
controller.set_timeout(30s);
req.set_tablet_id(tablet_id);
RETURN_NOT_OK(master_admin_proxy_->SplitTablet(req, &resp, &controller));
SCHECK_FORMAT(
!resp.has_error(), InternalError, "SplitTablet RPC failed. Error: $0",
resp.error().ShortDebugString());
return Status::OK();
}

std::unique_ptr<rpc::Messenger> messenger_;
std::unique_ptr<rpc::ProxyCache> proxy_cache_;
std::shared_ptr<MasterAdminProxy> master_admin_proxy_;
std::shared_ptr<MasterBackupProxy> master_backup_proxy_;
std::unique_ptr<pgwrapper::PGConn> source_conn_;

Expand Down Expand Up @@ -700,6 +727,99 @@ TEST_F(PgCloneTest, YB_DISABLE_TEST_IN_SANITIZERS(CloneAfterDropTable)) {
ASSERT_EQ(row, kRows[0]);
}

TEST_F(PgCloneTest, YB_DISABLE_TEST_IN_SANITIZERS(TabletSplitting)) {
const int kNumRows = 1000;

// Test that we are able to clone to:
// 1. Before the split occurs on the master (when the children are upserted into the sys catalog).
// 2. After the split occurs on the master server but before the parent is hidden.
// 3. After the split parent is hidden.
auto clone_and_validate = [&]
(const std::string& target_namespace, int64_t timestamp, int expected_num_tablets) -> Status {
RETURN_NOT_OK(source_conn_->ExecuteFormat(
"CREATE DATABASE $0 TEMPLATE $1 AS OF $2", target_namespace, kSourceNamespaceName,
timestamp));
auto target_conn = VERIFY_RESULT(ConnectToDB(target_namespace));
auto rows = VERIFY_RESULT((target_conn.FetchRows<int32_t, int32_t>("SELECT * FROM t1")));
SCHECK_EQ(rows.size(), kNumRows, IllegalState, "Number of rows mismatch");
auto table = VERIFY_RESULT(GetTable("t1", target_namespace));
SCHECK_EQ(
VERIFY_RESULT(table->GetTablets()).size(), expected_num_tablets, IllegalState,
"Number of tablets mismatch");
return Status::OK();
};

// Do not clean up split tablets for now.
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_deleting_split_tablets) = true;

// Write enough data for a middle key so tablet splitting succeeds.
ASSERT_OK(source_conn_->ExecuteFormat(
"INSERT INTO t1 VALUES (generate_series(1, $0), generate_series(1, $0))", kNumRows));
ASSERT_OK(cluster_->FlushTablets());

TableInfoPtr source_table = ASSERT_RESULT(GetTable("t1", kSourceNamespaceName));
auto tablets = ASSERT_RESULT(source_table->GetTablets());
ASSERT_EQ(tablets.size(), 3);
auto before_split_timestamp = ASSERT_RESULT(GetCurrentTime()).ToInt64();

auto split_tablet_id = tablets[0]->tablet_id();
ASSERT_OK(SplitTablet(split_tablet_id));

// Wait for the split to complete on master.
// The parent should still be running because we have cleanup is still disabled.
ASSERT_OK(WaitFor([&]() -> Result<bool> {
return VERIFY_RESULT(source_table->GetTablets(IncludeInactive::kTrue)).size() == 5;
}, 30s, "Wait for master split."));
auto after_master_split_timestamp = ASSERT_RESULT(GetCurrentTime()).ToInt64();

// We should have 3 tablets before the master side split, and 4 after.
ASSERT_OK(clone_and_validate(kTargetNamespaceName1, before_split_timestamp, 3));
ASSERT_OK(clone_and_validate(kTargetNamespaceName2, after_master_split_timestamp, 4));

// Enable cleanup of split parents and wait for the split parent to be deleted.
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_deleting_split_tablets) = false;
ASSERT_OK(WaitFor([&]() -> Result<bool> {
auto tablets = VERIFY_RESULT(source_table->GetTablets(IncludeInactive::kTrue));
for (auto& tablet : tablets) {
if (tablet->id() == split_tablet_id) {
return tablet->LockForRead()->is_hidden();
}
}
return STATUS_FORMAT(NotFound, "Split parent tablet $0 not found", split_tablet_id);
}, 30s, "Wait for split parent to be hidden."));
auto parent_hidden_timestamp = ASSERT_RESULT(GetCurrentTime()).ToInt64();

// Clone to after the split parent was hidden. We should have 4 child tablets.
ASSERT_OK(clone_and_validate("testdb_clone3", parent_hidden_timestamp, 4));
}

TEST_F(PgCloneTest, YB_DISABLE_TEST_IN_SANITIZERS(TabletSplittingWithIndex)) {
// Test that we can clone after splitting an index.
// Write enough data for a middle key so tablet splitting succeeds.
const int kNumRows = 1000;
ASSERT_OK(source_conn_->Execute("CREATE INDEX i1 ON t1(value)"));
ASSERT_OK(source_conn_->ExecuteFormat(
"INSERT INTO t1 VALUES (generate_series(1, $0), generate_series(1, $0))", kNumRows));
ASSERT_OK(cluster_->FlushTablets());

// Split an index tablet.
TableInfoPtr source_index = ASSERT_RESULT(GetTable("i1", kSourceNamespaceName));
auto tablets = ASSERT_RESULT(source_index->GetTablets());
ASSERT_EQ(tablets.size(), 3);
auto split_tablet_id = tablets[0]->tablet_id();
ASSERT_OK(SplitTablet(split_tablet_id));

// Wait for split to complete.
ASSERT_OK(WaitFor([&]() -> Result<bool> {
return VERIFY_RESULT(source_index->GetTablets()).size() == 4;
}, 30s, "Wait for split to complete."));

// Clone.
ASSERT_OK(source_conn_->ExecuteFormat(
"CREATE DATABASE $0 TEMPLATE $1", kTargetNamespaceName1, kSourceNamespaceName));
ASSERT_RESULT(GetTable("i1", kTargetNamespaceName1));
}

TEST_F(PgCloneTest, YB_DISABLE_TEST_IN_SANITIZERS(UserIsSet)) {
// Test that the user is set to the user running the clone operation.
ASSERT_OK(source_conn_->Execute("CREATE ROLE test_user WITH LOGIN PASSWORD 'test'"));
Expand Down
8 changes: 2 additions & 6 deletions src/yb/master/catalog_manager_ext.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1489,19 +1489,15 @@ Result<RepeatedPtrField<BackupRowEntryPB>> CatalogManager::GetBackupEntriesAsOfT
return Status::OK();
}));

// Pass 3: Get all the SysTabletsEntry that are in a running state as of read_time and belongs to
// the running tables from pass 2.
// Pass 3: Get all active (not split) tablets that belong to the tables from pass 2.
docdb::DocRowwiseIterator tablets_iter = docdb::DocRowwiseIterator(
projection, doc_read_cntxt, TransactionOperationContext(), doc_db,
docdb::ReadOperationData::FromSingleReadTime(read_time), db_pending_op);
RETURN_NOT_OK(EnumerateSysCatalog(
&tablets_iter, doc_read_cntxt.schema(), SysRowEntryType::TABLET,
[&tables_to_tablets](const Slice& id, const Slice& data) -> Status {
auto pb = VERIFY_RESULT(pb_util::ParseFromSlice<SysTabletsEntryPB>(data));
// TODO(Yamen): handle tablet splitting cases by either keeping the parent or the children
// according to their state.
if (tables_to_tablets.contains(pb.table_id()) && pb.state() == SysTabletsEntryPB::RUNNING &&
pb.hide_hybrid_time() == 0) {
if (tables_to_tablets.contains(pb.table_id()) && pb.split_tablet_ids_size() == 0) {
VLOG_WITH_FUNC(1) << "Found SysTabletsEntryPB: " << pb.ShortDebugString();
tables_to_tablets[pb.table_id()].tablets_entries.push_back(
std::make_pair(id.ToBuffer(), pb));
Expand Down
8 changes: 6 additions & 2 deletions src/yb/master/clone/clone_state_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -495,9 +495,13 @@ Status CloneStateManager::ScheduleCloneOps(
const auto& clone_pb_lock = clone_state->LockForRead();
tablet::CloneTabletRequestPB req;
if (not_snapshotted_tablets.contains(tablet_data.source_tablet_id)) {
RSTATUS_DCHECK(source_tablet->LockForRead()->pb.hide_hybrid_time() != 0, IllegalState,
Format("Expected not snapshotted tablet to be in HIDDEN state. Actual: $0",
auto lock = source_tablet->LockForRead();
RSTATUS_DCHECK(lock->is_hidden() || lock->pb.split_tablet_ids_size() != 0, IllegalState,
Format("Expected not snapshotted tablet to be hidden or split state. Actual state: $0",
source_table_lock->state_name()));
VLOG(1) << Format(
"Cloning tablet $0 from active rocksdb since it was deleted or split before snapshot",
tablet_data.source_tablet_id);
req.set_clone_from_active_rocksdb(true);
}
req.set_tablet_id(tablet_data.source_tablet_id);
Expand Down
2 changes: 1 addition & 1 deletion src/yb/master/table_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ TableIndex::TablesRange TableIndex::GetAllTables() const {
}

TableIndex::TablesRange TableIndex::GetPrimaryTables() const {
return tables_.get<ColocatedUserTableTag>().equal_range(false);
return tables_.get<ColocatedUserTableTag>().equal_range(false /* is_colocated_user_table */);
}

void TableIndex::Clear() {
Expand Down
2 changes: 1 addition & 1 deletion src/yb/tserver/ts_tablet_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ DEFINE_UNKNOWN_int32(tablet_start_warn_threshold_ms, 500,
"a warning with a trace.");
TAG_FLAG(tablet_start_warn_threshold_ms, hidden);

DEFINE_UNKNOWN_int32(cleanup_split_tablets_interval_sec, 60,
DEFINE_NON_RUNTIME_int32(cleanup_split_tablets_interval_sec, 60,
"Interval at which tablet manager tries to cleanup split tablets which are no longer "
"needed. Setting this to 0 disables cleanup of split tablets.");

Expand Down

0 comments on commit 3273e9b

Please sign in to comment.