Skip to content

Commit

Permalink
[#8229] backup: repartition table if needed on YSQL restore
Browse files Browse the repository at this point in the history
Summary:
Sometimes, ysql_dump may not provide enough partitioning hints.  Then,
when importing snapshots, an error may be thrown for a mismatch on
number of tablets between the snapshot table and ysql_dump-created
table.

Fix this by recreating partitions for the YSQL table in this case.  Do
not yet address the case where number of tablets are the same but
partitions are split differently.

In the implementation, do not worry about waiting for index tablets to
finish creating on tservers, since that isn't done for YCQL either
(YCQL already recreates the table without such a wait).

Add test YBBackupTest.TestYSQLChangeDefaultNumTablets and disabled test
YBBackupTest.TestYSQLRangeSplitConstraint.

Also,

- add a VLOG for AsyncCreateReplica response
- fix AddTabletUnlocked thread annotation from REQUIRES_SHARED to
  REQUIRES
- fix ScopedTabletInfoCommitter to avoid double CommitMutation
- move ScopedTabletInfoCommitter from anonymous namespace to
  catalog_entity_info.h so that ent::CatalogManager can use it
- make ScopedTabletInfoCommitter a template class ScopedInfoCommitter so
  that it can later be used for other objects like TableInfo

Do not close issue #8229 since there will be a part 2 handling partition
schemas that differ on split points but have the same number of
partitions.

Test Plan:
    ./yb_build.sh --cxx-test tools_yb-backup-test_ent \
      --gtest_filter YBBackupTest.TestYSQLChangeDefaultNumTablets

Reviewers: nicolas, oleg

Reviewed By: oleg

Subscribers: ybase, bogdan

Differential Revision: https://phabricator.dev.yugabyte.com/D13122
  • Loading branch information
jaki committed Sep 30, 2021
1 parent 7986576 commit b14485a
Show file tree
Hide file tree
Showing 8 changed files with 385 additions and 70 deletions.
2 changes: 2 additions & 0 deletions ent/src/yb/master/catalog_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,8 @@ class CatalogManager : public yb::master::CatalogManager, SnapshotCoordinatorCon
CHECKED_STATUS RecreateTable(const NamespaceId& new_namespace_id,
const ExternalTableSnapshotDataMap& table_map,
ExternalTableSnapshotData* table_data);
// Repartition an already-created YSQL table so that its tablets match the
// partitions recorded in the external snapshot (table_data); used on restore
// when the ysql_dump-created table's tablet count differs from the snapshot's.
CHECKED_STATUS RepartitionTable(scoped_refptr<TableInfo> table,
const ExternalTableSnapshotData* table_data);
CHECKED_STATUS ImportTableEntry(const NamespaceMap& namespace_map,
const ExternalTableSnapshotDataMap& table_map,
ExternalTableSnapshotData* s_data);
Expand Down
155 changes: 149 additions & 6 deletions ent/src/yb/master/catalog_manager_ent.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1169,6 +1169,141 @@ Status CatalogManager::RecreateTable(const NamespaceId& new_namespace_id,
return Status::OK();
}

// Repartition an existing, RUNNING YSQL table so that its tablets match the
// partition boundaries recorded in the external snapshot (table_data). Used
// during YSQL restore when the ysql_dump-created table ended up with a
// different number of tablets than the snapshot contains.
//
// On success: the replacement tablets are committed in state PREPARING
// (background tasks later drive their actual creation on tservers), the old
// tablets are marked DELETED with delete-replica requests already sent, and
// the table's partition schema is replaced by the snapshot's. On any early
// return, the ScopedInfoCommitter guards abort the pending tablet mutations.
Status CatalogManager::RepartitionTable(const scoped_refptr<TableInfo> table,
                                        const ExternalTableSnapshotData* table_data) {
  DCHECK_EQ(table->id(), table_data->new_table_id);
  if (table->GetTableType() != PGSQL_TABLE_TYPE) {
    return STATUS_FORMAT(InvalidArgument,
                         "Cannot repartition non-YSQL table: got $0",
                         TableType_Name(table->GetTableType()));
  }
  LOG_WITH_FUNC(INFO) << "Repartition table " << table->id()
                      << " using external snapshot table " << table_data->old_table_id;

  // Get partitions from external snapshot. The vector is pre-sized so each
  // PB can be decoded in place.
  size_t i = 0;
  vector<Partition> partitions(table_data->partitions.size());
  for (const auto& partition_pb : table_data->partitions) {
    Partition::FromPB(partition_pb, &partitions[i++]);
  }
  VLOG_WITH_FUNC(3) << "Got " << partitions.size()
                    << " partitions from external snapshot for table " << table->id();

  // Change TableInfo to point to the new tablets.
  string deletion_msg;
  vector<scoped_refptr<TabletInfo>> new_tablets;
  vector<scoped_refptr<TabletInfo>> old_tablets;
  {
    // Acquire the TableInfo pb write lock. Although it is not required for some of the individual
    // steps, we want to hold it through so that we guarantee the state does not change during the
    // whole process. Consequently, we hold it through some steps that require mutex_, but since
    // taking mutex_ after TableInfo pb lock is prohibited for deadlock reasons, acquire mutex_
    // first, then release it when it is no longer needed, still holding table pb lock.
    TableInfo::WriteLock table_lock;
    {
      LockGuard lock(mutex_);
      TRACE("Acquired catalog manager lock");

      // Make sure the table is in RUNNING state.
      // This by itself doesn't need a pb write lock: just a read lock. However, we want to prevent
      // other writers from entering from this point forward, so take the write lock now.
      table_lock = table->LockForWrite();
      if (table->old_pb().state() != SysTablesEntryPB::RUNNING) {
        return STATUS_FORMAT(IllegalState,
                             "Table $0 not running: $1",
                             table->ToString(),
                             SysTablesEntryPB_State_Name(table->old_pb().state()));
      }
      // Make sure the table's tablets can be deleted.
      RETURN_NOT_OK_PREPEND(CheckIfForbiddenToDeleteTabletOf(table),
                            Format("Cannot repartition table $0", table->id()));

      // Create and mark new tablets for creation.

      // Use partitions from external snapshot to create new tablets in state PREPARING. The tablets
      // will start CREATING once they are committed in memory.
      for (const auto& partition : partitions) {
        PartitionPB partition_pb;
        partition.ToPB(&partition_pb);
        new_tablets.push_back(CreateTabletInfo(table.get(), partition_pb));
      }

      // Add tablets to catalog manager tablet_map_. This should be safe to do after creating
      // tablets since we're still holding mutex_.
      auto tablet_map_checkout = tablet_map_.CheckOut();
      for (auto& new_tablet : new_tablets) {
        InsertOrDie(tablet_map_checkout.get_ptr(), new_tablet->tablet_id(), new_tablet);
      }
      VLOG_WITH_FUNC(3) << "Prepared creation of " << new_tablets.size()
                        << " new tablets for table " << table->id();

      // mutex_ is no longer needed, so release by going out of scope.
    }
    // The table pb write lock is still held, ensuring that the table state does not change. Later
    // steps, like GetTablets or AddTablets, will acquire/release the TableInfo lock_, but it's
    // probably fine that they are released between the steps since the table pb write lock is held
    // throughout. In other words, there should be no risk that TableInfo tablets_ changes between
    // GetTablets and RemoveTablets.

    // Abort tablet mutations in case of early returns.
    ScopedInfoCommitter<TabletInfo> unlocker_new(&new_tablets);

    // Mark old tablets for deletion.
    old_tablets = table->GetTablets(IncludeInactive::kTrue);
    // Sort so that locking can be done in a deterministic order.
    std::sort(old_tablets.begin(), old_tablets.end(), [](const auto& lhs, const auto& rhs) {
      return lhs->tablet_id() < rhs->tablet_id();
    });
    deletion_msg = Format("Old tablets of table $0 deleted at $1",
                          table->id(), LocalTimeAsString());
    for (auto& old_tablet : old_tablets) {
      old_tablet->mutable_metadata()->StartMutation();
      old_tablet->mutable_metadata()->mutable_dirty()->set_state(
          SysTabletsEntryPB::DELETED, deletion_msg);
    }
    VLOG_WITH_FUNC(3) << "Prepared deletion of " << old_tablets.size() << " old tablets for table "
                      << table->id();

    // Abort tablet mutations in case of early returns.
    ScopedInfoCommitter<TabletInfo> unlocker_old(&old_tablets);

    // Change table's partition schema to the external snapshot's.
    table_lock.mutable_data()->pb.mutable_partition_schema()->CopyFrom(
        table_data->table_entry_pb.partition_schema());

    // Remove old tablets from TableInfo.
    table->RemoveTablets(old_tablets);
    // Add new tablets to TableInfo. This must be done after removing tablets because
    // TableInfo::partitions_ has key PartitionKey, which old and new tablets may conflict on.
    table->AddTablets(new_tablets);

    // Commit table and tablets to disk.
    RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), table, new_tablets, old_tablets));
    VLOG_WITH_FUNC(2) << "Committed to disk: table " << table->id() << " repartition from "
                      << old_tablets.size() << " tablets to " << new_tablets.size() << " tablets";

    // Commit to memory. Commit new tablets (addition) first since that doesn't break anything.
    // Commit table next since new tablets are already committed and ready to be referenced. Commit
    // old tablets (deletion) last since the table is not referencing them anymore.
    unlocker_new.Commit();
    table_lock.Commit();
    unlocker_old.Commit();
    VLOG_WITH_FUNC(1) << "Committed to memory: table " << table->id() << " repartition from "
                      << old_tablets.size() << " tablets to " << new_tablets.size() << " tablets";
  }

  // Finally, now that everything is committed, send the delete tablet requests.
  for (auto& old_tablet : old_tablets) {
    DeleteTabletReplicas(old_tablet.get(), deletion_msg, HideOnly::kFalse);
  }
  VLOG_WITH_FUNC(2) << "Sent delete tablet requests for " << old_tablets.size() << " old tablets"
                    << " of table " << table->id();
  // The create tablet requests should be handled by bg tasks which find the PREPARING tablets after
  // commit.

  return Status::OK();
}

// Helper function for ImportTableEntry.
//
// Given an internal table and an external table snapshot, do some checks to determine if we should
Expand Down Expand Up @@ -1398,12 +1533,20 @@ Status CatalogManager::ImportTableEntry(const NamespaceMap& namespace_map,
}

if (table_data->num_tablets > 0 && new_num_tablets != table_data->num_tablets) {
const string msg = Format(
"Wrong number of tablets in created $0 table '$1' in namespace id $2: $3 (expected $4)",
TableType_Name(meta.table_type()), meta.name(), new_namespace_id,
new_num_tablets, table_data->num_tablets);
LOG_WITH_FUNC(WARNING) << msg;
return STATUS(InternalError, msg, MasterError(MasterErrorPB::SNAPSHOT_FAILED));
// TODO(#8229): Also recreate tables besides num tablets mismatching since it is possible for
// splits to be off while num tablets matches. For example, [start, 0x777f), [0x777f, end] vs
// [start, 0x8000), [0x8000, end]. That particular example may not occur in practice, but the
// issue should show up when restoring tablets that have been dynamically split.
if (meta.table_type() == TableType::PGSQL_TABLE_TYPE) {
RETURN_NOT_OK(RepartitionTable(table, table_data));
} else {
const string msg = Format(
"Wrong number of tablets in created $0 table '$1' in namespace id $2: $3 (expected $4)",
TableType_Name(meta.table_type()), meta.name(), new_namespace_id,
new_num_tablets, table_data->num_tablets);
LOG_WITH_FUNC(WARNING) << msg;
return STATUS(InternalError, msg, MasterError(MasterErrorPB::SNAPSHOT_FAILED));
}
}

// Update the table column ids if it's not equal to the stored ids.
Expand Down
150 changes: 150 additions & 0 deletions ent/src/yb/tools/yb-backup-test_ent.cc
Original file line number Diff line number Diff line change
Expand Up @@ -894,6 +894,156 @@ TEST_F(YBBackupTest, YB_DISABLE_TEST_IN_SANITIZERS_OR_MAC(TestYCQLBackupWithDefi
}
}

// Test backup/restore on table with UNIQUE constraint where the unique constraint is originally
// range partitioned to multiple tablets. When creating the constraint, split to 3 tablets at custom
// split points. When restoring, ysql_dump is not able to express the splits, so it will create the
// constraint as 1 hash tablet. Restore should restore the unique constraint index as 3 tablets
// since the tablet snapshot files are already split into 3 tablets.
//
// TODO(jason): enable test when issue #4873 ([YSQL] Support backup for pre-split multi-tablet range
// tables) is fixed.
TEST_F(YBBackupTest, YB_DISABLE_TEST(TestYSQLRangeSplitConstraint)) {
const string table_name = "mytbl";
const string index_name = "myidx";

// Create table with unique constraint where the unique constraint is custom range partitioned.
ASSERT_NO_FATALS(CreateTable(
Format("CREATE TABLE $0 (k SERIAL PRIMARY KEY, v TEXT)", table_name)));
ASSERT_NO_FATALS(CreateIndex(
Format("CREATE UNIQUE INDEX $0 ON $1 (v ASC) SPLIT AT VALUES (('foo'), ('qux'))",
index_name, table_name)));
// Attach the pre-split index to the table as its UNIQUE constraint.
ASSERT_NO_FATALS(RunPsqlCommand(
Format("ALTER TABLE $0 ADD UNIQUE USING INDEX $1", table_name, index_name),
"ALTER TABLE"));

// Write data in each partition of the index ('bar' < 'foo' <= 'jar' < 'qux' <= 'tar').
ASSERT_NO_FATALS(InsertRows(
Format("INSERT INTO $0 (v) VALUES ('bar'), ('jar'), ('tar')", table_name), 3));
ASSERT_NO_FATALS(RunPsqlCommand(
Format("SELECT * FROM $0 ORDER BY k", table_name),
R"#(
k | v
---+-----
1 | bar
2 | jar
3 | tar
(3 rows)
)#"
));

// Backup.
const string backup_dir = GetTempDir("backup");
ASSERT_OK(RunBackupCommand(
{"--backup_location", backup_dir, "--keyspace", "ysql.yugabyte", "create"}));

// Drop the table (and index) so that, on restore, running the ysql_dump file recreates the table
// (and index).
ASSERT_NO_FATALS(RunPsqlCommand(Format("DROP TABLE $0", table_name), "DROP TABLE"));

// Restore should notice that the index it creates from ysql_dump file (1 tablet) differs from
// the external snapshot (3 tablets), so it should adjust to match the snapshot (3 tablets).
ASSERT_OK(RunBackupCommand({"--backup_location", backup_dir, "restore"}));

// Verify data survived the round trip.
ASSERT_NO_FATALS(RunPsqlCommand(
Format("SELECT * FROM $0 ORDER BY k", table_name),
R"#(
k | v
---+-----
1 | bar
2 | jar
3 | tar
(3 rows)
)#"
));

LOG(INFO) << "Test finished: " << CURRENT_TEST_CASE_AND_TEST_NAME_STR();
}

class YBBackupTestNumTablets : public YBBackupTest {
public:
void UpdateMiniClusterOptions(ExternalMiniClusterOptions* options) override {
YBBackupTest::UpdateMiniClusterOptions(options);

options->extra_tserver_flags.push_back("--ycql_num_tablets=3");
options->extra_tserver_flags.push_back("--ysql_num_tablets=3");
}
};

// Test backup/restore on table with UNIQUE constraint when default number of tablets differs. When
// creating the table, the default is 3; when restoring, the default is 2. Restore should restore
// the unique constraint index as 3 tablets since the tablet snapshot files are already split into 3
// tablets.
//
// For debugging, run ./yb_build.sh with extra flags:
// - --extra-daemon-flags "--vmodule=client=1,table_creator=1"
// - --test-args "--verbose_yb_backup"
TEST_F_EX(YBBackupTest,
YB_DISABLE_TEST_IN_SANITIZERS_OR_MAC(TestYSQLChangeDefaultNumTablets),
YBBackupTestNumTablets) {
const string table_name = "mytbl";
// The UNIQUE (v) constraint index gets the default name <table>_v_key.
const string index_name = table_name + "_v_key";

ASSERT_NO_FATALS(CreateTable(Format(
"CREATE TABLE $0 (k INT PRIMARY KEY, v TEXT, UNIQUE (v))", table_name)));

LOG(INFO) << "pre-backup: get table";
std::vector<client::YBTableName> tables = ASSERT_RESULT(client_->ListTables(index_name));
ASSERT_EQ(tables.size(), 1);
string table_id = tables.front().table_id();

// The fixture sets --ysql_num_tablets=3, so the index should have 3 tablets.
LOG(INFO) << "pre-backup: get tablets";
google::protobuf::RepeatedPtrField<yb::master::TabletLocationsPB> tablets;
ASSERT_OK(client_->GetTabletsFromTableId(table_id, -1, &tablets));
ASSERT_EQ(tablets.size(), 3);

const string backup_dir = GetTempDir("backup");
ASSERT_OK(RunBackupCommand(
{"--backup_location", backup_dir, "--keyspace", "ysql.yugabyte", "create"}));

// Drop the table (and index) so that, on restore, running the ysql_dump file recreates the table
// (and index).
ASSERT_NO_FATALS(RunPsqlCommand(Format("DROP TABLE $0", table_name), "DROP TABLE"));

// When restore runs the CREATE TABLE, make it run in an environment where the default number of
// tablets is different. Namely, run it with new default 2 (previously 3). This won't affect the
// table since the table is generated with SPLIT clause specifying 3, but it will change the way
// the unique index is created because the unique index has no corresponding grammar to specify
// number of splits in ysql_dump file.
for (auto ts : cluster_->tserver_daemons()) {
ts->Shutdown();
ts->mutable_flags()->push_back("--ysql_num_tablets=2");
ASSERT_OK(ts->Restart());
}

// Check that --ysql_num_tablets=2 is working as intended by
// 1. running the CREATE TABLE that is expected to be found in the ysql_dump file and
// 2. finding 2 index tablets
ASSERT_NO_FATALS(CreateTable(Format(
"CREATE TABLE $0 (k INT PRIMARY KEY, v TEXT, UNIQUE (v))", table_name)));
tables = ASSERT_RESULT(client_->ListTables(index_name));
ASSERT_EQ(tables.size(), 1);
table_id = tables.front().table_id();
ASSERT_OK(client_->GetTabletsFromTableId(table_id, -1, &tablets));
ASSERT_EQ(tablets.size(), 2);
ASSERT_NO_FATALS(RunPsqlCommand(Format("DROP TABLE $0", table_name), "DROP TABLE"));

// Restore should notice that the index it creates from ysql_dump file (2 tablets) differs from
// the external snapshot (3 tablets), so it should adjust to match the snapshot (3 tablets).
ASSERT_OK(RunBackupCommand({"--backup_location", backup_dir, "restore"}));

LOG(INFO) << "post-restore: get table";
tables = ASSERT_RESULT(client_->ListTables(index_name));
ASSERT_EQ(tables.size(), 1);
table_id = tables.front().table_id();

// The restored index should have been repartitioned back to 3 tablets to
// match the external snapshot, despite the new default of 2.
LOG(INFO) << "post-restore: get tablets";
ASSERT_OK(client_->GetTabletsFromTableId(table_id, -1, &tablets));
ASSERT_EQ(tablets.size(), 3);

LOG(INFO) << "Test finished: " << CURRENT_TEST_CASE_AND_TEST_NAME_STR();
}

class YBFailSnapshotTest: public YBBackupTest {
void SetUp() override {
YBBackupTest::SetUp();
Expand Down
1 change: 1 addition & 0 deletions src/yb/master/async_rpc_tasks.cc
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,7 @@ void AsyncCreateReplica::HandleResponse(int attempt) {
}

TransitionToCompleteState();
VLOG_WITH_PREFIX(1) << "TS " << permanent_uuid_ << ": complete on tablet " << tablet_id_;
}

bool AsyncCreateReplica::SendRequest(int attempt) {
Expand Down
Loading

0 comments on commit b14485a

Please sign in to comment.