Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Move open job failure explanation out of root cause #31925

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -678,10 +678,7 @@ public void validate(OpenJobAction.JobParams params, ClusterState clusterState)
PersistentTasksCustomMetaData.Assignment assignment = selectLeastLoadedMlNode(params.getJobId(), clusterState,
maxConcurrentJobAllocations, fallbackMaxNumberOfOpenJobs, maxMachineMemoryPercent, logger);
if (assignment.getExecutorNode() == null) {
String msg = "Could not open job because no suitable nodes were found, allocation explanation ["
+ assignment.getExplanation() + "]";
logger.warn("[{}] {}", params.getJobId(), msg);
throw new ElasticsearchStatusException(msg, RestStatus.TOO_MANY_REQUESTS);
throw makeNoSuitableNodesException(logger, params.getJobId(), assignment.getExplanation());
}
}

Expand Down Expand Up @@ -785,9 +782,9 @@ public boolean test(PersistentTasksCustomMetaData.PersistentTask<?> persistentTa
// and this is why this class must only be used when opening a job
if (assignment != null && assignment.equals(PersistentTasksCustomMetaData.INITIAL_ASSIGNMENT) == false &&
assignment.isAssigned() == false) {
OpenJobAction.JobParams params = (OpenJobAction.JobParams) persistentTask.getParams();
// Assignment has failed on the master node despite passing our "fast fail" validation
exception = new ElasticsearchStatusException("Could not open job because no suitable nodes were found, " +
"allocation explanation [" + assignment.getExplanation() + "]", RestStatus.TOO_MANY_REQUESTS);
exception = makeNoSuitableNodesException(logger, params.getJobId(), assignment.getExplanation());
// The persistent task should be cancelled so that the observed outcome is the
// same as if the "fast fail" validation on the coordinating node had failed
shouldCancel = true;
Expand All @@ -813,4 +810,12 @@ public boolean test(PersistentTasksCustomMetaData.PersistentTask<?> persistentTa
}
}
}

static ElasticsearchException makeNoSuitableNodesException(Logger logger, String jobId, String explanation) {
String msg = "Could not open job because no suitable nodes were found, allocation explanation [" + explanation + "]";
logger.warn("[{}] {}", jobId, msg);
Exception detail = new IllegalStateException(msg);
return new ElasticsearchStatusException("Could not open job because no ML nodes with sufficient capacity were found",
RestStatus.TOO_MANY_REQUESTS, detail);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -366,10 +366,14 @@ public void testMlIndicesNotAvailable() throws Exception {

Exception e = expectThrows(ElasticsearchStatusException.class,
() -> client().execute(OpenJobAction.INSTANCE, openJobRequest).actionGet());
assertTrue(e.getMessage(),
e.getMessage().startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
assertTrue(e.getMessage(), e.getMessage().endsWith("because not all primary shards are active for the following indices "
+ "[.ml-state,.ml-anomalies-shared]]"));
assertEquals("Could not open job because no ML nodes with sufficient capacity were found", e.getMessage());
IllegalStateException detail = (IllegalStateException) e.getCause();
assertNotNull(detail);
String detailedMessage = detail.getMessage();
assertTrue(detailedMessage,
detailedMessage.startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
assertTrue(detailedMessage, detailedMessage.endsWith("because not all primary shards are active for the following indices " +
"[.ml-state,.ml-anomalies-shared]]"));

logger.info("Start data node");
String nonMlNode = internalCluster().startNode(Settings.builder()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,18 +92,22 @@ private void verifyMaxNumberOfJobsLimit(int numNodes, int maxNumberOfJobsPerNode
});
logger.info("Opened {}th job", i);
} catch (ElasticsearchStatusException e) {
assertTrue(e.getMessage(),
e.getMessage().startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
assertEquals("Could not open job because no ML nodes with sufficient capacity were found", e.getMessage());
IllegalStateException detail = (IllegalStateException) e.getCause();
assertNotNull(detail);
String detailedMessage = detail.getMessage();
assertTrue(detailedMessage,
detailedMessage.startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
if (expectMemoryLimitBeforeCountLimit) {
int expectedJobsAlreadyOpenOnNode = (i - 1) / numNodes;
assertTrue(e.getMessage(),
e.getMessage().endsWith("because this node has insufficient available memory. Available memory for ML [" +
maxMlMemoryPerNode + "], memory required by existing jobs [" +
(expectedJobsAlreadyOpenOnNode * memoryFootprintPerJob) +
"], estimated memory required for this job [" + memoryFootprintPerJob + "]]"));
assertTrue(detailedMessage,
detailedMessage.endsWith("because this node has insufficient available memory. Available memory for ML [" +
maxMlMemoryPerNode + "], memory required by existing jobs [" +
(expectedJobsAlreadyOpenOnNode * memoryFootprintPerJob) + "], estimated memory required for this job [" +
memoryFootprintPerJob + "]]"));
} else {
assertTrue(e.getMessage(), e.getMessage().endsWith("because this node is full. Number of opened jobs [" +
maxNumberOfJobsPerNode + "], xpack.ml.max_open_jobs [" + maxNumberOfJobsPerNode + "]]"));
assertTrue(detailedMessage, detailedMessage.endsWith("because this node is full. Number of opened jobs [" +
maxNumberOfJobsPerNode + "], xpack.ml.max_open_jobs [" + maxNumberOfJobsPerNode + "]]"));
}
logger.info("good news everybody --> reached maximum number of allowed opened jobs, after trying to open the {}th job", i);

Expand Down