Skip to content

Commit

Permalink
[BACKPORT 2.20.2] [PLAT-8632] Add runtime config for waitForServerRea…
Browse files Browse the repository at this point in the history
…dy timeout

Summary:
Add a runtime config, `yb.checks.wait_for_server_ready.timeout` (default: 10 minutes), that controls how long waitForServerReady waits, replacing the static 10-minute wait. Some customers have a huge number of tablets to bootstrap locally, and 10 minutes is not enough for them.

Original diff: https://phorge.dev.yugabyte.com/D29341

Test Plan:
Create a 3-node RF3 universe.

1. Modify the `yb.checks.wait_for_server_ready.timeout` runtime config to a very low value.
2. Run a rolling restart.

The `waitForServerReady` subtask will fail on the first node because of the very low timeout.

Reviewers: sneelakantan

Reviewed By: sneelakantan

Subscribers: yugaware

Tags: #jenkins-ready

Differential Revision: https://phorge.dev.yugabyte.com/D36418
  • Loading branch information
charleswang234 authored and iSignal committed Jul 6, 2024
1 parent b40acf6 commit 5e022ce
Show file tree
Hide file tree
Showing 7 changed files with 33 additions and 11 deletions.
1 change: 1 addition & 0 deletions managed/RUNTIME-FLAGS.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@
| "Enabling follower lag check" | "yb.checks.follower_lag.enabled" | "UNIVERSE" | "Controls whether or not to perform the follower lag checks" | "Boolean" |
| "Follower lag check timeout" | "yb.checks.follower_lag.timeout" | "UNIVERSE" | "Controls the max time out when performing follower lag checks" | "Duration" |
| "Max threshold for follower lag" | "yb.checks.follower_lag.max_threshold" | "UNIVERSE" | "The maximum time that we allow a tserver to be behind its peers" | "Duration" |
| "Wait for server ready timeout" | "yb.checks.wait_for_server_ready.timeout" | "UNIVERSE" | "Controls the max time for server to finish locally bootstrapping" | "Duration" |
| "Memory check timeout" | "yb.dbmem.checks.timeout" | "UNIVERSE" | "Timeout for memory check in secs" | "Long" |
| "Wait time before doing restore during xCluster setup task" | "yb.xcluster.sleep_time_before_restore" | "UNIVERSE" | "The amount of time to sleep (wait) before executing restore subtask during xCluster setup; it is useful because xCluster setup also drops the database before restore and the sleep makes sure the drop operation has reached all the nodes" | "Duration" |
| "Use server broadcast address for yb_backup" | "yb.backup.use_server_broadcast_address_for_yb_backup" | "UNIVERSE" | "Controls whether server_broadcast_address entry should be used during yb_backup.py backup/restore" | "Boolean" |
Expand Down
6 changes: 6 additions & 0 deletions managed/devops/pex/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# Get the latest docker image
FROM quay.io/pypa/manylinux2014_x86_64


# Comment out the mirrorlist entries in the CentOS yum repo files and switch their
# baseurl from mirror.centos.org to vault.centos.org.
# NOTE(review): presumably needed so the yum install below can still resolve packages
# on this base image — confirm.
RUN sed -i 's/mirrorlist=/#mirrorlist=/g' /etc/yum.repos.d/CentOS-*
RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' \
/etc/yum.repos.d/CentOS-*


# Perform general yum updates
RUN yum --enablerepo=extras -y install epel-release python3-pip
RUN pip3 install --upgrade pip
Expand Down
6 changes: 3 additions & 3 deletions managed/devops/yb_release
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ activate_virtualenv
bin/install_ansible_requirements.sh --force

# Python modules build for yugabundle.
cd "$yb_devops_home"
docker build -t "$DOCKER_VENV_IMAGE_NAME" .
docker run -v "$yb_devops_home:/devops" -u "$UID:$(id -g $UID)" "$DOCKER_VENV_IMAGE_NAME"
# cd "$yb_devops_home"
# docker build -t "$DOCKER_VENV_IMAGE_NAME" .
# docker run -v "$yb_devops_home:/devops" -u "$UID:$(id -g $UID)" "$DOCKER_VENV_IMAGE_NAME"

# PEX virtual env build for other deployments.
cd "$yb_devops_home/pex"
Expand Down
1 change: 0 additions & 1 deletion managed/devops/yb_release_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
"destroy-instance.yml",
"python_requirements_frozen.txt",
"python3_requirements_frozen.txt",
"python*_modules.tar.gz",
"preprovision.yml",
"use_custom_ssh_port.yml",
"yb-server-ctl.yml",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import com.yugabyte.yw.commissioner.BaseTaskDependencies;
import com.yugabyte.yw.commissioner.tasks.UniverseTaskBase.ServerType;
import com.yugabyte.yw.commissioner.tasks.params.ServerSubTaskParams;
import com.yugabyte.yw.common.config.UniverseConfKeys;
import com.yugabyte.yw.models.Universe;
import java.time.Duration;
import java.util.concurrent.CancellationException;
import java.util.concurrent.TimeUnit;
Expand All @@ -33,9 +35,6 @@ public class WaitForServerReady extends ServerSubTaskBase {
// Log after these many iterations.
private static final int LOG_EVERY_NUM_ITERS = 100;

// Maximum total wait time for the rpc to return 0 not-running tablets (10min).
private static final int MAX_TOTAL_WAIT_MS = 600000;

@Inject
protected WaitForServerReady(BaseTaskDependencies baseTaskDependencies) {
super(baseTaskDependencies);
Expand Down Expand Up @@ -68,24 +67,30 @@ private void sleepRemaining(long userWaitTimeMs, long timeElapsedMs) {
public void run() {
checkParams();

Stopwatch stopwatch = Stopwatch.createStarted();
int numIters = 0;
Duration userWaitTime = Duration.ofMillis(taskParams().waitTimeMs);
Duration maxTotalWaitTime = Duration.ofMillis(MAX_TOTAL_WAIT_MS);
HostAndPort hp = getHostPort();
boolean isMasterTask = taskParams().serverType == ServerType.MASTER;
IsServerReadyResponse response = null;
String errorMessage = null;
boolean shouldLog = false;

Universe universe = Universe.getOrBadRequest(taskParams().getUniverseUUID());
// Max timeout to wait for check to complete.
Duration maxSubtaskTimeout =
confGetter.getConfForScope(universe, UniverseConfKeys.waitForServerReadyTimeout);
Stopwatch stopwatch = Stopwatch.createStarted();

try (YBClient client = getClient()) {
while (true) {
shouldLog = (numIters % LOG_EVERY_NUM_ITERS) == 0;
if (stopwatch.elapsed().compareTo(maxTotalWaitTime) > 0) {
if (stopwatch.elapsed().compareTo(maxSubtaskTimeout) > 0) {
log.info("Timing out after iters={}. error '{}'.", numIters, errorMessage);
throw new RuntimeException(
String.format(
"WaitForServerReady, max number attempts reached: %s. Failing...", numIters));
"WaitForServerReady, timing out after retrying %d times for a duration of %dms,"
+ " greater than max time out of %dms. Failing...",
numIters, stopwatch.elapsed().toMillis(), maxSubtaskTimeout.toMillis()));
}
errorMessage = null;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,14 @@ public class UniverseConfKeys extends RuntimeConfigKeysModule {
"The maximum time that we allow a tserver to be behind its peers",
ConfDataType.DurationType,
ImmutableList.of(ConfKeyTags.PUBLIC));
// Universe-scoped runtime config key for the maximum time the WaitForServerReady subtask
// keeps retrying before it fails with a timeout. The value is a Duration; the shipped
// default is declared in reference.conf (wait_for_server_ready.timeout, 10m).
public static final ConfKeyInfo<Duration> waitForServerReadyTimeout =
new ConfKeyInfo<>(
"yb.checks.wait_for_server_ready.timeout",
ScopeType.UNIVERSE,
"Wait for server ready timeout",
"Controls the max time for server to finish locally bootstrapping",
ConfDataType.DurationType,
ImmutableList.of(ConfKeyTags.PUBLIC));
public static final ConfKeyInfo<Long> checkMemoryTimeoutSecs =
new ConfKeyInfo<>(
"yb.dbmem.checks.timeout",
Expand Down
3 changes: 3 additions & 0 deletions managed/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -672,6 +672,9 @@ yb {
enabled = true
max_threshold = 60000ms
}
wait_for_server_ready {
timeout: 10m
}
}

health {
Expand Down

0 comments on commit 5e022ce

Please sign in to comment.