Skip to content

Commit

Permalink
[ML] Move open job failure explanation out of root cause (#31925)
Browse files Browse the repository at this point in the history
When an ML job cannot be allocated to a node the exception
contained an explanation of why the job couldn't be
allocated to each node in the cluster.  For large clusters
this was not particularly easy to read and made the error
displayed in the UI look very scary.

This commit changes the structure of the error to an outer
ElasticsearchException with a high level message and an
inner IllegalStateException containing the detailed
explanation.  Because the definition of root cause is the
innermost ElasticsearchException the detailed explanation
will not be the root cause (which is what Kibana displays).

Fixes #29950
  • Loading branch information
droberts195 authored Jul 13, 2018
1 parent c662565 commit d246164
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -678,10 +678,7 @@ public void validate(OpenJobAction.JobParams params, ClusterState clusterState)
PersistentTasksCustomMetaData.Assignment assignment = selectLeastLoadedMlNode(params.getJobId(), clusterState,
maxConcurrentJobAllocations, fallbackMaxNumberOfOpenJobs, maxMachineMemoryPercent, logger);
if (assignment.getExecutorNode() == null) {
String msg = "Could not open job because no suitable nodes were found, allocation explanation ["
+ assignment.getExplanation() + "]";
logger.warn("[{}] {}", params.getJobId(), msg);
throw new ElasticsearchStatusException(msg, RestStatus.TOO_MANY_REQUESTS);
throw makeNoSuitableNodesException(logger, params.getJobId(), assignment.getExplanation());
}
}

Expand Down Expand Up @@ -785,9 +782,9 @@ public boolean test(PersistentTasksCustomMetaData.PersistentTask<?> persistentTa
// and this is why this class must only be used when opening a job
if (assignment != null && assignment.equals(PersistentTasksCustomMetaData.INITIAL_ASSIGNMENT) == false &&
assignment.isAssigned() == false) {
OpenJobAction.JobParams params = (OpenJobAction.JobParams) persistentTask.getParams();
// Assignment has failed on the master node despite passing our "fast fail" validation
exception = new ElasticsearchStatusException("Could not open job because no suitable nodes were found, " +
"allocation explanation [" + assignment.getExplanation() + "]", RestStatus.TOO_MANY_REQUESTS);
exception = makeNoSuitableNodesException(logger, params.getJobId(), assignment.getExplanation());
// The persistent task should be cancelled so that the observed outcome is the
// same as if the "fast fail" validation on the coordinating node had failed
shouldCancel = true;
Expand All @@ -813,4 +810,12 @@ public boolean test(PersistentTasksCustomMetaData.PersistentTask<?> persistentTa
}
}
}

static ElasticsearchException makeNoSuitableNodesException(Logger logger, String jobId, String explanation) {
String msg = "Could not open job because no suitable nodes were found, allocation explanation [" + explanation + "]";
logger.warn("[{}] {}", jobId, msg);
Exception detail = new IllegalStateException(msg);
return new ElasticsearchStatusException("Could not open job because no ML nodes with sufficient capacity were found",
RestStatus.TOO_MANY_REQUESTS, detail);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -366,10 +366,14 @@ public void testMlIndicesNotAvailable() throws Exception {

Exception e = expectThrows(ElasticsearchStatusException.class,
() -> client().execute(OpenJobAction.INSTANCE, openJobRequest).actionGet());
assertTrue(e.getMessage(),
e.getMessage().startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
assertTrue(e.getMessage(), e.getMessage().endsWith("because not all primary shards are active for the following indices "
+ "[.ml-state,.ml-anomalies-shared]]"));
assertEquals("Could not open job because no ML nodes with sufficient capacity were found", e.getMessage());
IllegalStateException detail = (IllegalStateException) e.getCause();
assertNotNull(detail);
String detailedMessage = detail.getMessage();
assertTrue(detailedMessage,
detailedMessage.startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
assertTrue(detailedMessage, detailedMessage.endsWith("because not all primary shards are active for the following indices " +
"[.ml-state,.ml-anomalies-shared]]"));

logger.info("Start data node");
String nonMlNode = internalCluster().startNode(Settings.builder()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,18 +92,22 @@ private void verifyMaxNumberOfJobsLimit(int numNodes, int maxNumberOfJobsPerNode
});
logger.info("Opened {}th job", i);
} catch (ElasticsearchStatusException e) {
assertTrue(e.getMessage(),
e.getMessage().startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
assertEquals("Could not open job because no ML nodes with sufficient capacity were found", e.getMessage());
IllegalStateException detail = (IllegalStateException) e.getCause();
assertNotNull(detail);
String detailedMessage = detail.getMessage();
assertTrue(detailedMessage,
detailedMessage.startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
if (expectMemoryLimitBeforeCountLimit) {
int expectedJobsAlreadyOpenOnNode = (i - 1) / numNodes;
assertTrue(e.getMessage(),
e.getMessage().endsWith("because this node has insufficient available memory. Available memory for ML [" +
maxMlMemoryPerNode + "], memory required by existing jobs [" +
(expectedJobsAlreadyOpenOnNode * memoryFootprintPerJob) +
"], estimated memory required for this job [" + memoryFootprintPerJob + "]]"));
assertTrue(detailedMessage,
detailedMessage.endsWith("because this node has insufficient available memory. Available memory for ML [" +
maxMlMemoryPerNode + "], memory required by existing jobs [" +
(expectedJobsAlreadyOpenOnNode * memoryFootprintPerJob) + "], estimated memory required for this job [" +
memoryFootprintPerJob + "]]"));
} else {
assertTrue(e.getMessage(), e.getMessage().endsWith("because this node is full. Number of opened jobs [" +
maxNumberOfJobsPerNode + "], xpack.ml.max_open_jobs [" + maxNumberOfJobsPerNode + "]]"));
assertTrue(detailedMessage, detailedMessage.endsWith("because this node is full. Number of opened jobs [" +
maxNumberOfJobsPerNode + "], xpack.ml.max_open_jobs [" + maxNumberOfJobsPerNode + "]]"));
}
logger.info("good news everybody --> reached maximum number of allowed opened jobs, after trying to open the {}th job", i);

Expand Down

0 comments on commit d246164

Please sign in to comment.