From 4da2e05bc89f08bc78377bd252ca003535910a07 Mon Sep 17 00:00:00 2001 From: kmg-stripe Date: Wed, 13 Nov 2024 15:02:45 -0800 Subject: [PATCH] Resubmit Worker Allocations (#725) A recent change removed worker resubmits when workers are stuck in accepted: https://github.com/Netflix/mantis/pull/719 Our underlying scheduler will not retry the allocations, so we need a way to conditionally enable the ability to resubmit. --- .../master/jobcluster/job/JobActor.java | 28 +++++++++++++++---- .../master/config/MasterConfiguration.java | 4 +++ .../master/scheduler/MantisScheduler.java | 7 +++++ .../scheduler/MantisSchedulerFactoryImpl.java | 3 +- .../ResourceClusterAwareScheduler.java | 6 ++++ .../jobcluster/job/JobTestMigrationTests.java | 5 ++++ .../master/scheduler/FakeMantisScheduler.java | 5 ++++ 7 files changed, 51 insertions(+), 7 deletions(-) diff --git a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/job/JobActor.java b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/job/JobActor.java index 9186c0310..b894b4a80 100644 --- a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/job/JobActor.java +++ b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/job/JobActor.java @@ -1906,13 +1906,29 @@ public void checkHeartBeats(Instant currentTime) { for (JobWorker worker : stage.getAllWorkers()) { IMantisWorkerMetadata workerMeta = worker.getMetadata(); if (!workerMeta.getLastHeartbeatAt().isPresent()) { - // the worker is still waiting for resource allocation and the scheduler should take care of - // the retry logic. Instant acceptedAt = Instant.ofEpochMilli(workerMeta.getAcceptedAt()); - LOGGER.warn("Job {}, Worker {} stuck in accepted state since {}", - this.jobMgr.getJobId(), - workerMeta.getWorkerId(), - acceptedAt); + if(!scheduler.schedulerHandlesAllocationRetries()) { + // worker stuck in accepted and the scheduler will not retry allocation requests, so + // we must resubmit + if (Duration.between(acceptedAt, currentTime).getSeconds() > stuckInSubmitToleranceSecs) { + LOGGER.info("Resubmitting Job {}, Worker {} that has been stuck in accepted state for {}", this.jobMgr.getJobId(), + workerMeta.getWorkerId(), Duration.between(acceptedAt, currentTime).getSeconds()); + workersToResubmit.add(worker); + eventPublisher.publishStatusEvent(new LifecycleEventsProto.WorkerStatusEvent( + WARN, + "worker stuck in Accepted state, resubmitting worker", + workerMeta.getStageNum(), + workerMeta.getWorkerId(), + workerMeta.getState())); + } + } else { + // the worker is still waiting for resource allocation and the scheduler should take care of + // the retry logic. + LOGGER.warn("Job {}, Worker {} stuck in accepted state since {}", + this.jobMgr.getJobId(), + workerMeta.getWorkerId(), + acceptedAt); + } } else { if (Duration.between(workerMeta.getLastHeartbeatAt().get(), currentTime).getSeconds() > missedHeartBeatToleranceSecs) { diff --git a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/config/MasterConfiguration.java b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/config/MasterConfiguration.java index df50e6bef..289ca587f 100644 --- a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/config/MasterConfiguration.java +++ b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/config/MasterConfiguration.java @@ -363,6 +363,10 @@ default Duration getSchedulerIntervalBetweenRetries() { @Default("3") int getSlaMaxHeadroomForAccepted(); + @Config("mantis.scheduler.handlesAllocationRetries") + @Default("true") + boolean getSchedulerHandlesAllocationRetries(); + default Duration getHeartbeatInterval() { return Duration.ofMillis(getHeartbeatIntervalInMs()); } diff --git a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/scheduler/MantisScheduler.java b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/scheduler/MantisScheduler.java index 8c41dbc1e..00273121b 100644 --- a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/scheduler/MantisScheduler.java +++ b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/scheduler/MantisScheduler.java @@ -76,4 +76,11 @@ public interface MantisScheduler { */ void initializeRunningWorker(final ScheduleRequest scheduleRequest, final String hostname, final String hostID); + /** + * This should return true if the underlying scheduler handles retrying worker allocations. + * + * @return If there are not enough resources to schedule the worker and the scheduler automatically retries until + * the worker is assigned, then return true; otherwise, return false. + */ + boolean schedulerHandlesAllocationRetries(); } diff --git a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/scheduler/MantisSchedulerFactoryImpl.java b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/scheduler/MantisSchedulerFactoryImpl.java index ad7f5026f..20e7ca285 100644 --- a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/scheduler/MantisSchedulerFactoryImpl.java +++ b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/scheduler/MantisSchedulerFactoryImpl.java @@ -63,7 +63,8 @@ public MantisScheduler forClusterID(ClusterID clusterID) { executeStageRequestFactory, jobMessageRouter, metricsRegistry), - "scheduler-for-" + cid.getResourceID())); + "scheduler-for-" + cid.getResourceID()), + masterConfiguration.getSchedulerHandlesAllocationRetries()); }); } else { log.error("Scheduler gets unexpected null clusterID"); diff --git a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/scheduler/ResourceClusterAwareScheduler.java b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/scheduler/ResourceClusterAwareScheduler.java index e1ef9e268..669339037 100644 --- a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/scheduler/ResourceClusterAwareScheduler.java +++ b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/server/master/scheduler/ResourceClusterAwareScheduler.java @@ -32,6 +32,7 @@ public class ResourceClusterAwareScheduler implements MantisScheduler { private final ActorRef schedulerActor; + private final boolean handlesAllocationRetries; @Override public void scheduleWorkers(BatchScheduleRequest scheduleRequest) { @@ -67,4 +68,9 @@ public void initializeRunningWorker(ScheduleRequest scheduleRequest, String host new InitializeRunningWorkerRequestEvent(scheduleRequest, TaskExecutorID.of(hostID)), null); } + + @Override + public boolean schedulerHandlesAllocationRetries() { + return handlesAllocationRetries; + } } diff --git a/mantis-control-plane/mantis-control-plane-server/src/test/java/io/mantisrx/master/jobcluster/job/JobTestMigrationTests.java b/mantis-control-plane/mantis-control-plane-server/src/test/java/io/mantisrx/master/jobcluster/job/JobTestMigrationTests.java index d8f5430f2..e024e85f4 100644 --- a/mantis-control-plane/mantis-control-plane-server/src/test/java/io/mantisrx/master/jobcluster/job/JobTestMigrationTests.java +++ b/mantis-control-plane/mantis-control-plane-server/src/test/java/io/mantisrx/master/jobcluster/job/JobTestMigrationTests.java @@ -209,6 +209,11 @@ public void initializeRunningWorker(ScheduleRequest scheduleRequest, String host // TODO Auto-generated method stub } + + @Override + public boolean schedulerHandlesAllocationRetries(){ + return false; + } } public static void main(String[] args) { diff --git a/mantis-control-plane/mantis-control-plane-server/src/test/java/io/mantisrx/master/scheduler/FakeMantisScheduler.java b/mantis-control-plane/mantis-control-plane-server/src/test/java/io/mantisrx/master/scheduler/FakeMantisScheduler.java index 027fbe3b9..1dade66d3 100644 --- a/mantis-control-plane/mantis-control-plane-server/src/test/java/io/mantisrx/master/scheduler/FakeMantisScheduler.java +++ b/mantis-control-plane/mantis-control-plane-server/src/test/java/io/mantisrx/master/scheduler/FakeMantisScheduler.java @@ -64,4 +64,9 @@ public void updateWorkerSchedulingReadyTime(final WorkerId workerId, final long public void initializeRunningWorker(final ScheduleRequest scheduleRequest, final String hostname, final String hostID) { // no-op } + + @Override + public boolean schedulerHandlesAllocationRetries() { + return false; + } }