Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fail live probe if fail-fast mechanism was triggered #145

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions src/main/java/com/ibm/watson/modelmesh/ModelMesh.java
Original file line number Diff line number Diff line change
Expand Up @@ -1292,6 +1292,14 @@ boolean isLeader() {
return le != null && le.isLeader();
}

@Override
protected boolean isLive() {
abrbird marked this conversation as resolved.
Show resolved Hide resolved
if (failFastUpgradeEnabled && failLiveOnFailFastEnabled) {
return btspSuccessCount != null || !abortStartup;
}
return true;
}

/*
* We don't begin to return READY until no other members of the same logical
* model-mesh deployment are in a terminating state. We can still receive
Expand Down Expand Up @@ -1332,6 +1340,8 @@ protected boolean isReady() {

/* -------------------------- "fail-fast" startup probation period feature -------------------- */

// true unless the FAILFAST_UPGRADE_ENV_VAR environment variable is "false" (case-insensitive)
protected volatile boolean failFastUpgradeEnabled;
// true only when the FAIL_LIVE_ON_FAILFAST_ENV_VAR environment variable is "true" (case-insensitive);
// together with failFastUpgradeEnabled it lets isLive() report not-live
protected volatile boolean failLiveOnFailFastEnabled;
protected volatile boolean abortStartup; // flag used to abort startup in case of unexpected model loading failures
protected AtomicInteger btspSuccessCount; // count of all successful loads during bootstrap
protected AtomicInteger btspFatalCount; // count of all fatal load failures during bootstrap
Expand All @@ -1343,9 +1353,9 @@ protected boolean isReady() {
BOOTSTRAP_CLEARANCE_PERIOD_MS = Long.parseLong(btspClearanceStr);
}

boolean failfastUpgradeEnabled = !"false".equalsIgnoreCase(
System.getenv(FAILFAST_UPGRADE_ENV_VAR));
if (failfastUpgradeEnabled) {
failFastUpgradeEnabled = !"false".equalsIgnoreCase(System.getenv(FAILFAST_UPGRADE_ENV_VAR));
failLiveOnFailFastEnabled = "true".equalsIgnoreCase(System.getenv(FAIL_LIVE_ON_FAILFAST_ENV_VAR));
if (failFastUpgradeEnabled) {
btspSuccessCount = new AtomicInteger();
btspFatalCount = new AtomicInteger();
btspFailureCount = new AtomicInteger();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ private ModelMeshEnvVars() {}

public static final String BOOTSTRAP_CLEARANCE_PERIOD_ENV_VAR = "BOOTSTRAP_CLEARANCE_PERIOD_MS";
// Name of the env var controlling the fail-fast upgrade feature; treated as enabled unless its value is "false" (case-insensitive)
public static final String FAILFAST_UPGRADE_ENV_VAR = "MM_FAILFAST_UPGRADE_ENABLED";
// Name of the env var that opts in to failing the liveness check on fail-fast; enabled only when its value is "true" (case-insensitive)
public static final String FAIL_LIVE_ON_FAILFAST_ENV_VAR = "MM_FAIL_LIVE_ON_FAILFAST_ENABLED";

public static final String GRPC_MAX_CONNECTION_AGE_SECS_ENV_VAR = "MM_SVC_GRPC_MAX_CONNECTION_AGE_SECS";
public static final String GRPC_MAX_CONNECTION_AGE_GRACE_SECS_ENV_VAR = "MM_SVC_GRPC_MAX_CONNECTION_AGE_GRACE_SECS";
Expand Down