Skip to content

Commit

Permalink
Fix multi slaves full sync condition race
Browse files Browse the repository at this point in the history
  • Loading branch information
ShooterIT committed Mar 20, 2021
1 parent a0c6b18 commit f3855a3
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 6 deletions.
10 changes: 7 additions & 3 deletions src/storage.cc
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,7 @@ Status Storage::DecrDBRefs() {

Status Storage::ReplDataManager::GetFullReplDataInfo(Storage *storage, std::string *files) {
std::string data_files_dir = storage->config_->checkpoint_dir;
std::unique_lock<std::mutex> ulm(storage->checkpoint_mu_);

// Create checkpoint if not exist
if (!storage->backup_env_->FileExists(data_files_dir).ok()) {
Expand All @@ -607,20 +608,23 @@ Status Storage::ReplDataManager::GetFullReplDataInfo(Storage *storage, std::stri
storage->SetCheckpointAccessTime(std::time(nullptr));
storage->SetCreatingCheckpoint(false);
if (!s.ok()) {
LOG(WARNING) << "Fail to create checkpoint, error:" << s.ToString();
LOG(WARNING) << "[storage] Fail to create checkpoint, error:" << s.ToString();
return Status(Status::NotOK, s.ToString());
}
LOG(INFO) << "[storage] Create checkpoint successfully";
} else {
// Replicas can share the existing checkpoint for replication if the
// checkpoint has existed for less than half of the WAL TTL.
int64_t can_shared_time = storage->config_->RocksDB.WAL_ttl_seconds / 2;
if (can_shared_time > 60 * 60) can_shared_time = 60 * 60;
if (can_shared_time < 10 * 60) can_shared_time = 10 * 60;
if (std::time(nullptr) - storage->GetCheckpointCreateTime() > can_shared_time) {
LOG(WARNING) << "Can't use current checkpoint, waiting next checkpoint";
return Status(Status::NotOK, "Can't use current checkpoint, waiting next checkpoint");
LOG(WARNING) << "[storage] Can't use current checkpoint, waiting next checkpoint";
return Status(Status::NotOK, "Can't use current checkpoint, waiting for next checkpoint");
}
LOG(INFO) << "[storage] Use current existing checkpoint";
}
ulm.unlock();

// Get checkpoint file list
std::vector<std::string> result;
Expand Down
1 change: 1 addition & 0 deletions src/storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ class Storage {
std::shared_ptr<rocksdb::SstFileManager> sst_file_manager_;
std::shared_ptr<rocksdb::RateLimiter> rate_limiter_;
ReplDataManager::CheckpointInfo checkpoint_info_;
std::mutex checkpoint_mu_;
Config *config_ = nullptr;
std::vector<rocksdb::ColumnFamilyHandle *> cf_handles_;
LockManager lock_mgr_;
Expand Down
38 changes: 35 additions & 3 deletions tests/tcl/tests/integration/replication.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@ start_server {tags {"repl"}} {
start_server {} {
test {Second server should have role master at first} {
# Can't satisfy partial replication
for {set i 0} {$i < 1000} {incr i} {
r set $i $i
}
populate 100 "" 10
s role
} {master}

Expand Down Expand Up @@ -94,3 +92,37 @@ start_server {tags {"repl"}} {
}
}
}

# Scenario: three nested test servers whose clients are held in A, B and C.
# A and B are both pointed at C with slaveof, one right after the other, and
# the test then asserts that the sync_full stat equals 2.
start_server {tags {"repl"}} {
# Client handle for the first (outermost) server started here.
set A [srv 0 client]
# Seed this server with data before replication starts.
# NOTE(review): presumably 100 keys, empty key prefix, 10-byte values;
# confirm against the populate helper.
populate 100 "" 10

start_server {} {
# Client handle for the second server.
set B [srv 0 client]
populate 100 "" 10

start_server {} {
# Client handle and address of the third server, which A and B will
# replicate from.
set C [srv 0 client]
set C_host [srv 0 host]
set C_port [srv 0 port]
populate 50 "" 10

test {Multi slaves full sync with master at the same time} {
# Issue both slaveof commands back to back, without waiting in between.
$A slaveof $C_host $C_port
$B slaveof $C_host $C_port

# Poll until the role output of both A and B contains "connected";
# otherwise fail the test.
# NOTE(review): presumably up to 500 tries, 100 ms apart; confirm against
# the wait_for_condition helper.
wait_for_condition 500 100 {
[string match {*connected*} [$A role]] &&
[string match {*connected*} [$B role]]
} else {
fail "Slaves can't sync with master"
}
# Pause for 50000 ms before checking the counter.
# NOTE(review): the reason for this long delay is not stated here; confirm
# whether it is needed and whether it can be shortened.
after 50000
# Exactly 2 full syncs are expected, one per replica.
# NOTE(review): presumably s reads the stat from the innermost server,
# i.e. the one held in C; confirm against the helper.
assert_equal 2 [s sync_full]
}
}

}
}

0 comments on commit f3855a3

Please sign in to comment.