From 3a7a12538f9fe3a16dd85a41b330fd81e5d74607 Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Tue, 17 Jul 2018 15:04:00 -0400 Subject: [PATCH 01/26] WIP: Prevent new host overloading --- .../config/SingularityConfiguration.java | 10 +++ .../mesos/SingularityMesosOfferScheduler.java | 64 ++++++++++++++----- ...ularitySlaveUsageWithCalculatedScores.java | 14 +++- .../scheduler/SingularityUsagePoller.java | 48 ++++++++++---- 4 files changed, 104 insertions(+), 32 deletions(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java b/SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java index 5a05b1d64d..c65aa7062d 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java @@ -376,6 +376,8 @@ public class SingularityConfiguration extends Configuration { private long preemptibleTaskMaxExpectedRuntimeMs = 900000; // 15 minutes + private long maxSlaveUsageMetricAgeMs = 30000; + public long getAskDriverToKillTasksAgainAfterMillis() { return askDriverToKillTasksAgainAfterMillis; } @@ -1593,4 +1595,12 @@ public long getPreemptibleTaskMaxExpectedRuntimeMs() { public void setPreemptibleTaskMaxExpectedRuntimeMs(long preemptibleTaskMaxExpectedRuntimeMs) { this.preemptibleTaskMaxExpectedRuntimeMs = preemptibleTaskMaxExpectedRuntimeMs; } + + public long getMaxSlaveUsageMetricAgeMs() { + return maxSlaveUsageMetricAgeMs; + } + + public void setMaxSlaveUsageMetricAgeMs(long maxSlaveUsageMetricAgeMs) { + this.maxSlaveUsageMetricAgeMs = maxSlaveUsageMetricAgeMs; + } } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index c1d843a5d1..c8838f11c2 100644 --- 
a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -29,6 +29,7 @@ import com.hubspot.singularity.RequestUtilization; import com.hubspot.singularity.SingularityDeployStatistics; import com.hubspot.singularity.SingularityPendingTaskId; +import com.hubspot.singularity.SingularitySlave; import com.hubspot.singularity.SingularitySlaveUsage; import com.hubspot.singularity.SingularitySlaveUsageWithId; import com.hubspot.singularity.SingularityTask; @@ -41,6 +42,7 @@ import com.hubspot.singularity.config.MesosConfiguration; import com.hubspot.singularity.config.SingularityConfiguration; import com.hubspot.singularity.data.DeployManager; +import com.hubspot.singularity.data.SlaveManager; import com.hubspot.singularity.data.TaskManager; import com.hubspot.singularity.data.UsageManager; import com.hubspot.singularity.helpers.MesosUtils; @@ -48,6 +50,7 @@ import com.hubspot.singularity.mesos.SingularitySlaveUsageWithCalculatedScores.MaxProbableUsage; import com.hubspot.singularity.scheduler.SingularityLeaderCache; import com.hubspot.singularity.scheduler.SingularityScheduler; +import com.hubspot.singularity.scheduler.SingularityUsagePoller; @Singleton public class SingularityMesosOfferScheduler { @@ -65,6 +68,8 @@ public class SingularityMesosOfferScheduler { private final SingularitySlaveAndRackManager slaveAndRackManager; private final SingularitySlaveAndRackHelper slaveAndRackHelper; private final SingularityTaskSizeOptimizer taskSizeOptimizer; + private final SingularityUsagePoller usagePoller; + private final SlaveManager slaveManager; private final UsageManager usageManager; private final DeployManager deployManager; private final SingularitySchedulerLock lock; @@ -89,6 +94,8 @@ public SingularityMesosOfferScheduler(MesosConfiguration mesosConfiguration, SingularityTaskSizeOptimizer taskSizeOptimizer, 
SingularitySlaveAndRackHelper slaveAndRackHelper, SingularityLeaderCache leaderCache, + SingularityUsagePoller usagePoller, + SlaveManager slaveManager, UsageManager usageManager, DeployManager deployManager, SingularitySchedulerLock lock) { @@ -102,6 +109,8 @@ public SingularityMesosOfferScheduler(MesosConfiguration mesosConfiguration, this.slaveAndRackManager = slaveAndRackManager; this.taskSizeOptimizer = taskSizeOptimizer; this.leaderCache = leaderCache; + this.usagePoller = usagePoller; + this.slaveManager = slaveManager; this.slaveAndRackHelper = slaveAndRackHelper; this.taskPrioritizer = taskPrioritizer; this.usageManager = usageManager; @@ -180,7 +189,8 @@ public Collection checkOffers(final Collection of mesosConfiguration.getScoreUsingSystemLoad(), getMaxProbableUsageForSlave(activeTaskIds, requestUtilizations, offerHolders.get(usageWithId.getSlaveId()).getSanitizedHost()), mesosConfiguration.getLoad5OverloadedThreshold(), - mesosConfiguration.getLoad1OverloadedThreshold() + mesosConfiguration.getLoad1OverloadedThreshold(), + usageWithId.getTimestamp() ) )); @@ -196,23 +206,43 @@ public Collection checkOffers(final Collection of List> scoringFutures = new ArrayList<>(); AtomicReference scoringException = new AtomicReference<>(null); for (SingularityOfferHolder offerHolder : offerHolders.values()) { - if (!isOfferFull(offerHolder)) { - scoringFutures.add( - offerScoringSemaphore.call( - () -> CompletableFuture.runAsync(() -> { - try { - double score = calculateScore(offerHolder, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, activeTaskIdsForRequest, requestUtilizations.get(taskRequestHolder.getTaskRequest().getRequest().getId())); - if (score != 0) { - scorePerOffer.put(offerHolder.getSlaveId(), score); - } - } catch (Throwable t) { - LOG.error("Uncaught exception while scoring offers", t); - scoringException.set(t); - } - }, - offerScoringExecutor - ))); + if (isOfferFull(offerHolder)) { + continue; + } + Optional maybeSlaveUsage = 
Optional.fromNullable(currentSlaveUsagesBySlaveId.get(offerHolder.getSlaveId())); + + if (taskManager.getActiveTasks().stream() + .anyMatch(t -> t.getTaskRequest().getDeploy().getTimestamp().or(System.currentTimeMillis()) > maybeSlaveUsage.get().getTimestamp() + && t.getMesosTask().getSlaveId().getValue().equals(offerHolder.getSlaveId()))) { + Optional maybeSlave = slaveManager.getSlave(offerHolder.getSlaveId()); + if (maybeSlave.isPresent()) { + usagePoller.getSlaveUsage(maybeSlave.get()); + } + continue; } + +// if (maybeSlaveUsage.isPresent() && System.currentTimeMillis() - maybeSlaveUsage.get().getTimestamp() > configuration.getMaxSlaveUsageMetricAgeMs()) { +// Optional maybeSlave = slaveManager.getSlave(offerHolder.getSlaveId()); +// if (maybeSlave.isPresent()) { +// usagePoller.getSlaveUsage(maybeSlave.get()); +// } +// continue; +// } + scoringFutures.add( + offerScoringSemaphore.call( + () -> CompletableFuture.runAsync(() -> { + try { + double score = calculateScore(offerHolder, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, activeTaskIdsForRequest, requestUtilizations.get(taskRequestHolder.getTaskRequest().getRequest().getId())); + if (score != 0) { + scorePerOffer.put(offerHolder.getSlaveId(), score); + } + } catch (Throwable t) { + LOG.error("Uncaught exception while scoring offers", t); + scoringException.set(t); + } + }, + offerScoringExecutor + ))); } CompletableFutures.allOf(scoringFutures).join(); diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularitySlaveUsageWithCalculatedScores.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularitySlaveUsageWithCalculatedScores.java index 758aa832c4..b634fa068a 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularitySlaveUsageWithCalculatedScores.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularitySlaveUsageWithCalculatedScores.java @@ -26,7 +26,14 @@ class 
SingularitySlaveUsageWithCalculatedScores { private final double load5Threshold; private final double load1Threshold; - SingularitySlaveUsageWithCalculatedScores(SingularitySlaveUsage slaveUsage, MachineLoadMetric systemLoadMetric, MaxProbableUsage maxProbableTaskUsage, double load5Threshold, double load1Threshold) { + private final long timestamp; + + SingularitySlaveUsageWithCalculatedScores(SingularitySlaveUsage slaveUsage, + MachineLoadMetric systemLoadMetric, + MaxProbableUsage maxProbableTaskUsage, + double load5Threshold, + double load1Threshold, + long timestamp) { this.slaveUsage = slaveUsage; this.systemLoadMetric = systemLoadMetric; this.maxProbableTaskUsage = maxProbableTaskUsage; @@ -39,6 +46,7 @@ class SingularitySlaveUsageWithCalculatedScores { } this.load5Threshold = load5Threshold; this.load1Threshold = load1Threshold; + this.timestamp = timestamp; } boolean isCpuOverloaded(double estimatedNumCpusToAdd) { @@ -121,6 +129,10 @@ SingularitySlaveUsage getSlaveUsage() { return diskInUseScore; } + long getTimestamp() { + return timestamp; + } + void addEstimatedCpuUsage(double estimatedAddedCpus) { this.estimatedAddedCpusUsage += estimatedAddedCpus; } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java index 011c9bb85e..6d1f6d7fcf 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java @@ -117,7 +117,7 @@ public void runActionOnPoll() { usageHelper.getSlavesToTrackUsageFor().forEach((slave) -> { usageFutures.add(usageCollectionSemaphore.call(() -> CompletableFuture.runAsync(() -> { - collectSlaveUage(slave, now, utilizationPerRequestId, previousUtilizations, overLoadedHosts, totalMemBytesUsed, totalMemBytesAvailable, + collectSlaveUsage(slave, now, 
utilizationPerRequestId, previousUtilizations, overLoadedHosts, totalMemBytesUsed, totalMemBytesAvailable, totalCpuUsed, totalCpuAvailable, totalDiskBytesUsed, totalDiskBytesAvailable); }, usageExecutor) )); @@ -126,8 +126,9 @@ public void runActionOnPoll() { CompletableFutures.allOf(usageFutures).join(); usageManager.saveClusterUtilization( - getClusterUtilization(utilizationPerRequestId, totalMemBytesUsed.get(), totalMemBytesAvailable.get(), totalCpuUsed.get(), totalCpuAvailable.get(), totalDiskBytesUsed.get(), totalDiskBytesAvailable - .get(), now)); + getClusterUtilization( + utilizationPerRequestId, totalMemBytesUsed.get(), totalMemBytesAvailable.get(), + totalCpuUsed.get(), totalCpuAvailable.get(), totalDiskBytesUsed.get(), totalDiskBytesAvailable.get(), now)); utilizationPerRequestId.values().forEach(usageManager::saveRequestUtilization); if (configuration.isShuffleTasksForOverloadedSlaves()) { @@ -135,6 +136,25 @@ public void runActionOnPoll() { } } + public CompletableFuture getSlaveUsage(SingularitySlave slave) { + return usageCollectionSemaphore.call(() -> + CompletableFuture.runAsync(() -> { + collectSlaveUsage( + slave, + System.currentTimeMillis(), + new ConcurrentHashMap<>(), + usageManager.getRequestUtilizations(), + new ConcurrentHashMap<>(), + new AtomicLong(), + new AtomicLong(), + new AtomicDouble(), + new AtomicDouble(), + new AtomicLong(), + new AtomicLong()); + }, usageExecutor) + ); + } + public void runWithRequestLock(Runnable function, String requestId) { ReentrantLock lock = requestLocks.computeIfAbsent(requestId, (r) -> new ReentrantLock()); lock.lock(); @@ -145,17 +165,17 @@ public void runWithRequestLock(Runnable function, String requestId) { } } - private void collectSlaveUage(SingularitySlave slave, - long now, - Map utilizationPerRequestId, - Map previousUtilizations, - Map> overLoadedHosts, - AtomicLong totalMemBytesUsed, - AtomicLong totalMemBytesAvailable, - AtomicDouble totalCpuUsed, - AtomicDouble totalCpuAvailable, - 
AtomicLong totalDiskBytesUsed, - AtomicLong totalDiskBytesAvailable) { + private void collectSlaveUsage(SingularitySlave slave, + long now, + Map utilizationPerRequestId, + Map previousUtilizations, + Map> overLoadedHosts, + AtomicLong totalMemBytesUsed, + AtomicLong totalMemBytesAvailable, + AtomicDouble totalCpuUsed, + AtomicDouble totalCpuAvailable, + AtomicLong totalDiskBytesUsed, + AtomicLong totalDiskBytesAvailable) { Optional memoryMbTotal = Optional.absent(); Optional cpusTotal = Optional.absent(); Optional diskMbTotal = Optional.absent(); From e610cf1ca46fed046c98cda0ae397fe1eae3890a Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Thu, 19 Jul 2018 16:56:13 -0400 Subject: [PATCH 02/26] Rework async stuff per PR --- .../mesos/SingularityMesosOfferScheduler.java | 95 +++++++++++-------- .../scheduler/SingularityUsagePoller.java | 18 ++-- 2 files changed, 68 insertions(+), 45 deletions(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index c8838f11c2..3beada3d5a 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -206,43 +206,11 @@ public Collection checkOffers(final Collection of List> scoringFutures = new ArrayList<>(); AtomicReference scoringException = new AtomicReference<>(null); for (SingularityOfferHolder offerHolder : offerHolders.values()) { - if (isOfferFull(offerHolder)) { - continue; - } - Optional maybeSlaveUsage = Optional.fromNullable(currentSlaveUsagesBySlaveId.get(offerHolder.getSlaveId())); - - if (taskManager.getActiveTasks().stream() - .anyMatch(t -> t.getTaskRequest().getDeploy().getTimestamp().or(System.currentTimeMillis()) > maybeSlaveUsage.get().getTimestamp() - && 
t.getMesosTask().getSlaveId().getValue().equals(offerHolder.getSlaveId()))) { - Optional maybeSlave = slaveManager.getSlave(offerHolder.getSlaveId()); - if (maybeSlave.isPresent()) { - usagePoller.getSlaveUsage(maybeSlave.get()); - } - continue; - } - -// if (maybeSlaveUsage.isPresent() && System.currentTimeMillis() - maybeSlaveUsage.get().getTimestamp() > configuration.getMaxSlaveUsageMetricAgeMs()) { -// Optional maybeSlave = slaveManager.getSlave(offerHolder.getSlaveId()); -// if (maybeSlave.isPresent()) { -// usagePoller.getSlaveUsage(maybeSlave.get()); -// } -// continue; -// } - scoringFutures.add( - offerScoringSemaphore.call( - () -> CompletableFuture.runAsync(() -> { - try { - double score = calculateScore(offerHolder, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, activeTaskIdsForRequest, requestUtilizations.get(taskRequestHolder.getTaskRequest().getRequest().getId())); - if (score != 0) { - scorePerOffer.put(offerHolder.getSlaveId(), score); - } - } catch (Throwable t) { - LOG.error("Uncaught exception while scoring offers", t); - scoringException.set(t); - } - }, - offerScoringExecutor - ))); + scoringFutures.add(offerScoringSemaphore.call(() -> + CompletableFuture.supplyAsync(() -> { + return buildScoringFuture(offerHolders, requestUtilizations, activeTaskIds, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, scorePerOffer, activeTaskIdsForRequest, scoringException, offerHolder); + }, + offerScoringExecutor))); } CompletableFutures.allOf(scoringFutures).join(); @@ -270,6 +238,59 @@ public Collection checkOffers(final Collection of return offerHolders.values(); } + private Void buildScoringFuture( + Map offerHolders, + Map requestUtilizations, + List activeTaskIds, + Map currentSlaveUsagesBySlaveId, + Map tasksPerOfferHost, + SingularityTaskRequestHolder taskRequestHolder, + Map scorePerOffer, + List activeTaskIdsForRequest, + AtomicReference scoringException, + SingularityOfferHolder offerHolder) { + if 
(isOfferFull(offerHolder)) { + return null; + } + String slaveId = offerHolder.getSlaveId(); + Optional maybeSlaveUsage = Optional.fromNullable(currentSlaveUsagesBySlaveId.get(slaveId)); + + if (taskManager.getActiveTasks().stream() + .anyMatch(t -> t.getTaskRequest().getDeploy().getTimestamp().or(System.currentTimeMillis()) > maybeSlaveUsage.get().getTimestamp() + && t.getMesosTask().getSlaveId().getValue().equals(slaveId))) { + Optional maybeSlave = slaveManager.getSlave(slaveId); + if (maybeSlave.isPresent()) { + usagePoller.getSlaveUsage(maybeSlave.get()) + .whenComplete((usage, throwable) -> { + if (throwable == null) { + currentSlaveUsagesBySlaveId.put(slaveId, new SingularitySlaveUsageWithCalculatedScores( + usage, + mesosConfiguration.getScoreUsingSystemLoad(), + getMaxProbableUsageForSlave(activeTaskIds, requestUtilizations, offerHolders.get(slaveId).getSanitizedHost()), + mesosConfiguration.getLoad5OverloadedThreshold(), + mesosConfiguration.getLoad1OverloadedThreshold(), + usage.getTimestamp() + )); + } else { + throw new RuntimeException(throwable); + } + }); + } + return null; + } + + try { + double score = calculateScore(offerHolder, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, activeTaskIdsForRequest, requestUtilizations.get(taskRequestHolder.getTaskRequest().getRequest().getId())); + if (score != 0) { + scorePerOffer.put(slaveId, score); + } + } catch (Throwable t) { + LOG.error("Uncaught exception while scoring offers", t); + scoringException.set(t); + } + return null; + } + private MaxProbableUsage getMaxProbableUsageForSlave(List activeTaskIds, Map requestUtilizations, String sanitizedHostname) { double cpu = 0; double memBytes = 0; diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java index 6d1f6d7fcf..e9894db9fb 100644 --- 
a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java @@ -68,7 +68,7 @@ public class SingularityUsagePoller extends SingularityLeaderOnlyPoller { private final DeployManager deployManager; private final TaskManager taskManager; - private final AsyncSemaphore usageCollectionSemaphore; + private final AsyncSemaphore usageCollectionSemaphore; private final ExecutorService usageExecutor; private final ConcurrentHashMap requestLocks; @@ -112,12 +112,12 @@ public void runActionOnPoll() { Map> overLoadedHosts = new ConcurrentHashMap<>(); - List> usageFutures = new ArrayList<>(); + List> usageFutures = new ArrayList<>(); usageHelper.getSlavesToTrackUsageFor().forEach((slave) -> { usageFutures.add(usageCollectionSemaphore.call(() -> - CompletableFuture.runAsync(() -> { - collectSlaveUsage(slave, now, utilizationPerRequestId, previousUtilizations, overLoadedHosts, totalMemBytesUsed, totalMemBytesAvailable, + CompletableFuture.supplyAsync(() -> { + return collectSlaveUsage(slave, now, utilizationPerRequestId, previousUtilizations, overLoadedHosts, totalMemBytesUsed, totalMemBytesAvailable, totalCpuUsed, totalCpuAvailable, totalDiskBytesUsed, totalDiskBytesAvailable); }, usageExecutor) )); @@ -136,10 +136,10 @@ public void runActionOnPoll() { } } - public CompletableFuture getSlaveUsage(SingularitySlave slave) { + public CompletableFuture getSlaveUsage(SingularitySlave slave) { return usageCollectionSemaphore.call(() -> - CompletableFuture.runAsync(() -> { - collectSlaveUsage( + CompletableFuture.supplyAsync(() -> { + return collectSlaveUsage( slave, System.currentTimeMillis(), new ConcurrentHashMap<>(), @@ -165,7 +165,7 @@ public void runWithRequestLock(Runnable function, String requestId) { } } - private void collectSlaveUsage(SingularitySlave slave, + private SingularitySlaveUsage collectSlaveUsage(SingularitySlave slave, long now, 
Map utilizationPerRequestId, Map previousUtilizations, @@ -334,11 +334,13 @@ private void collectSlaveUsage(SingularitySlave slave, LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage); usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage); + return slaveUsage; } catch (Throwable t) { String message = String.format("Could not get slave usage for host %s", slave.getHost()); LOG.error(message, t); exceptionNotifier.notify(message, t); } + return null; // TODO: is this really okay? } private boolean isEligibleForShuffle(SingularityTaskId task) { From 339dd4491295a2a12d30b06126826667c76ab90c Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Tue, 24 Jul 2018 13:49:46 -0400 Subject: [PATCH 03/26] PR changes --- .../mesos/SingularityMesosOfferScheduler.java | 30 ++++++++---- ...ularitySlaveUsageWithCalculatedScores.java | 4 -- .../scheduler/SingularityUsagePoller.java | 48 ++++++------------- 3 files changed, 37 insertions(+), 45 deletions(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index 3beada3d5a..11da1b23ac 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -10,6 +10,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import java.util.stream.Collectors; @@ -22,6 +23,7 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; +import com.google.common.util.concurrent.AtomicDouble; import 
com.google.common.util.concurrent.ThreadFactoryBuilder; import com.google.inject.Inject; import com.hubspot.mesos.Resources; @@ -208,7 +210,7 @@ public Collection checkOffers(final Collection of for (SingularityOfferHolder offerHolder : offerHolders.values()) { scoringFutures.add(offerScoringSemaphore.call(() -> CompletableFuture.supplyAsync(() -> { - return buildScoringFuture(offerHolders, requestUtilizations, activeTaskIds, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, scorePerOffer, activeTaskIdsForRequest, scoringException, offerHolder); + return calculateScore(offerHolders, requestUtilizations, activeTaskIds, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, scorePerOffer, activeTaskIdsForRequest, scoringException, offerHolder); }, offerScoringExecutor))); } @@ -238,7 +240,7 @@ public Collection checkOffers(final Collection of return offerHolders.values(); } - private Void buildScoringFuture( + private Void calculateScore( Map offerHolders, Map requestUtilizations, List activeTaskIds, @@ -256,27 +258,39 @@ private Void buildScoringFuture( Optional maybeSlaveUsage = Optional.fromNullable(currentSlaveUsagesBySlaveId.get(slaveId)); if (taskManager.getActiveTasks().stream() - .anyMatch(t -> t.getTaskRequest().getDeploy().getTimestamp().or(System.currentTimeMillis()) > maybeSlaveUsage.get().getTimestamp() + .anyMatch(t -> t.getTaskRequest().getDeploy().getTimestamp().or(System.currentTimeMillis()) > maybeSlaveUsage.get().getSlaveUsage().getTimestamp() && t.getMesosTask().getSlaveId().getValue().equals(slaveId))) { Optional maybeSlave = slaveManager.getSlave(slaveId); if (maybeSlave.isPresent()) { - usagePoller.getSlaveUsage(maybeSlave.get()) + CompletableFuture.supplyAsync(() -> + usagePoller.collectSlaveUsage( + maybeSlave.get(), + System.currentTimeMillis(), + new ConcurrentHashMap<>(), + usageManager.getRequestUtilizations(), + new ConcurrentHashMap<>(), + new AtomicLong(), + new AtomicLong(), + new AtomicDouble(), + new 
AtomicDouble(), + new AtomicLong(), + new AtomicLong()), + offerScoringExecutor) .whenComplete((usage, throwable) -> { - if (throwable == null) { + if (throwable == null && usage.isPresent()) { currentSlaveUsagesBySlaveId.put(slaveId, new SingularitySlaveUsageWithCalculatedScores( - usage, + usage.get(), mesosConfiguration.getScoreUsingSystemLoad(), getMaxProbableUsageForSlave(activeTaskIds, requestUtilizations, offerHolders.get(slaveId).getSanitizedHost()), mesosConfiguration.getLoad5OverloadedThreshold(), mesosConfiguration.getLoad1OverloadedThreshold(), - usage.getTimestamp() + usage.get().getTimestamp() )); } else { throw new RuntimeException(throwable); } }); } - return null; } try { diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularitySlaveUsageWithCalculatedScores.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularitySlaveUsageWithCalculatedScores.java index b634fa068a..0c8ae4cef8 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularitySlaveUsageWithCalculatedScores.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularitySlaveUsageWithCalculatedScores.java @@ -129,10 +129,6 @@ SingularitySlaveUsage getSlaveUsage() { return diskInUseScore; } - long getTimestamp() { - return timestamp; - } - void addEstimatedCpuUsage(double estimatedAddedCpus) { this.estimatedAddedCpusUsage += estimatedAddedCpus; } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java index e9894db9fb..9c9449f344 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java @@ -118,7 +118,7 @@ public void runActionOnPoll() { usageFutures.add(usageCollectionSemaphore.call(() -> 
CompletableFuture.supplyAsync(() -> { return collectSlaveUsage(slave, now, utilizationPerRequestId, previousUtilizations, overLoadedHosts, totalMemBytesUsed, totalMemBytesAvailable, - totalCpuUsed, totalCpuAvailable, totalDiskBytesUsed, totalDiskBytesAvailable); + totalCpuUsed, totalCpuAvailable, totalDiskBytesUsed, totalDiskBytesAvailable).get(); }, usageExecutor) )); }); @@ -136,25 +136,6 @@ public void runActionOnPoll() { } } - public CompletableFuture getSlaveUsage(SingularitySlave slave) { - return usageCollectionSemaphore.call(() -> - CompletableFuture.supplyAsync(() -> { - return collectSlaveUsage( - slave, - System.currentTimeMillis(), - new ConcurrentHashMap<>(), - usageManager.getRequestUtilizations(), - new ConcurrentHashMap<>(), - new AtomicLong(), - new AtomicLong(), - new AtomicDouble(), - new AtomicDouble(), - new AtomicLong(), - new AtomicLong()); - }, usageExecutor) - ); - } - public void runWithRequestLock(Runnable function, String requestId) { ReentrantLock lock = requestLocks.computeIfAbsent(requestId, (r) -> new ReentrantLock()); lock.lock(); @@ -165,17 +146,18 @@ public void runWithRequestLock(Runnable function, String requestId) { } } - private SingularitySlaveUsage collectSlaveUsage(SingularitySlave slave, - long now, - Map utilizationPerRequestId, - Map previousUtilizations, - Map> overLoadedHosts, - AtomicLong totalMemBytesUsed, - AtomicLong totalMemBytesAvailable, - AtomicDouble totalCpuUsed, - AtomicDouble totalCpuAvailable, - AtomicLong totalDiskBytesUsed, - AtomicLong totalDiskBytesAvailable) { + public Optional collectSlaveUsage( + SingularitySlave slave, + long now, + Map utilizationPerRequestId, + Map previousUtilizations, + Map> overLoadedHosts, + AtomicLong totalMemBytesUsed, + AtomicLong totalMemBytesAvailable, + AtomicDouble totalCpuUsed, + AtomicDouble totalCpuAvailable, + AtomicLong totalDiskBytesUsed, + AtomicLong totalDiskBytesAvailable) { Optional memoryMbTotal = Optional.absent(); Optional cpusTotal = Optional.absent(); 
Optional diskMbTotal = Optional.absent(); @@ -334,13 +316,13 @@ private SingularitySlaveUsage collectSlaveUsage(SingularitySlave slave, LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage); usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage); - return slaveUsage; + return Optional.of(slaveUsage); } catch (Throwable t) { String message = String.format("Could not get slave usage for host %s", slave.getHost()); LOG.error(message, t); exceptionNotifier.notify(message, t); } - return null; // TODO: is this really okay? + return Optional.absent(); } private boolean isEligibleForShuffle(SingularityTaskId task) { From d814a9d62f2a2dad401c45042fcde1a2fe6898ec Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Tue, 31 Jul 2018 12:55:01 -0400 Subject: [PATCH 04/26] Move usage collection loop --- .../mesos/SingularityMesosOfferScheduler.java | 94 +++++++++---------- .../scheduler/SingularityUsagePoller.java | 15 +++ 2 files changed, 57 insertions(+), 52 deletions(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index 11da1b23ac..cc27af9899 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -10,7 +10,6 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import java.util.stream.Collectors; @@ -23,7 +22,6 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; -import com.google.common.util.concurrent.AtomicDouble; import 
com.google.common.util.concurrent.ThreadFactoryBuilder; import com.google.inject.Inject; import com.hubspot.mesos.Resources; @@ -178,23 +176,54 @@ public Collection checkOffers(final Collection of Map requestUtilizations = usageManager.getRequestUtilizations(); List activeTaskIds = taskManager.getActiveTaskIds(); - final Map currentSlaveUsagesBySlaveId = usageManager.getCurrentSlaveUsages( + Map currentSlaveUsages = usageManager.getCurrentSlaveUsages( offerHolders.values() .stream() .map(SingularityOfferHolder::getSlaveId) .collect(Collectors.toList())) - .parallelStream() - .collect(Collectors.toMap( - SingularitySlaveUsageWithId::getSlaveId, - (usageWithId) -> new SingularitySlaveUsageWithCalculatedScores( - usageWithId, + .stream() + .collect(Collectors.toMap(SingularitySlaveUsageWithId::getSlaveId, Function.identity())); + + List> currentSlaveUsagesFutures = new ArrayList<>(); + for (SingularityOfferHolder offerHolder : offerHolders.values()) { + currentSlaveUsagesFutures.add(CompletableFuture.runAsync(() -> { + String slaveId = offerHolder.getSlaveId(); + Optional maybeSlaveUsage = Optional.fromNullable(currentSlaveUsages.get(slaveId)); + + if (taskManager.getActiveTasks().stream() + .anyMatch(t -> t.getTaskRequest().getDeploy().getTimestamp().or(System.currentTimeMillis()) > maybeSlaveUsage.get().getTimestamp() + && t.getMesosTask().getSlaveId().getValue().equals(slaveId))) { + Optional maybeSlave = slaveManager.getSlave(slaveId); + if (maybeSlave.isPresent()) { + currentSlaveUsages.put( + slaveId, + new SingularitySlaveUsageWithId(usagePoller.collectSlaveUsage( + maybeSlave.get(), + System.currentTimeMillis(), + usageManager.getRequestUtilizations()).get(), slaveId)); + } + } + }, offerScoringExecutor)); + } + CompletableFutures.allOf(currentSlaveUsagesFutures).join(); + + List> usagesWithScoresFutures = new ArrayList<>(); + Map currentSlaveUsagesBySlaveId = new ConcurrentHashMap<>(); + for (SingularitySlaveUsageWithId usage : currentSlaveUsages.values()) { 
+ usagesWithScoresFutures.add( + CompletableFuture.runAsync(() -> currentSlaveUsagesBySlaveId.put(usage.getSlaveId(), + new SingularitySlaveUsageWithCalculatedScores( + usage, mesosConfiguration.getScoreUsingSystemLoad(), - getMaxProbableUsageForSlave(activeTaskIds, requestUtilizations, offerHolders.get(usageWithId.getSlaveId()).getSanitizedHost()), + getMaxProbableUsageForSlave(activeTaskIds, requestUtilizations, offerHolders.get(usage.getSlaveId()).getSanitizedHost()), mesosConfiguration.getLoad5OverloadedThreshold(), mesosConfiguration.getLoad1OverloadedThreshold(), - usageWithId.getTimestamp() - ) - )); + usage.getTimestamp())), + offerScoringExecutor) + ); + } + + CompletableFutures.allOf(usagesWithScoresFutures).join(); LOG.trace("Found slave usages {}", currentSlaveUsagesBySlaveId); @@ -210,7 +239,7 @@ public Collection checkOffers(final Collection of for (SingularityOfferHolder offerHolder : offerHolders.values()) { scoringFutures.add(offerScoringSemaphore.call(() -> CompletableFuture.supplyAsync(() -> { - return calculateScore(offerHolders, requestUtilizations, activeTaskIds, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, scorePerOffer, activeTaskIdsForRequest, scoringException, offerHolder); + return calculateScore(requestUtilizations, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, scorePerOffer, activeTaskIdsForRequest, scoringException, offerHolder); }, offerScoringExecutor))); } @@ -241,9 +270,7 @@ public Collection checkOffers(final Collection of } private Void calculateScore( - Map offerHolders, Map requestUtilizations, - List activeTaskIds, Map currentSlaveUsagesBySlaveId, Map tasksPerOfferHost, SingularityTaskRequestHolder taskRequestHolder, @@ -255,43 +282,6 @@ private Void calculateScore( return null; } String slaveId = offerHolder.getSlaveId(); - Optional maybeSlaveUsage = Optional.fromNullable(currentSlaveUsagesBySlaveId.get(slaveId)); - - if (taskManager.getActiveTasks().stream() - .anyMatch(t -> 
t.getTaskRequest().getDeploy().getTimestamp().or(System.currentTimeMillis()) > maybeSlaveUsage.get().getSlaveUsage().getTimestamp() - && t.getMesosTask().getSlaveId().getValue().equals(slaveId))) { - Optional maybeSlave = slaveManager.getSlave(slaveId); - if (maybeSlave.isPresent()) { - CompletableFuture.supplyAsync(() -> - usagePoller.collectSlaveUsage( - maybeSlave.get(), - System.currentTimeMillis(), - new ConcurrentHashMap<>(), - usageManager.getRequestUtilizations(), - new ConcurrentHashMap<>(), - new AtomicLong(), - new AtomicLong(), - new AtomicDouble(), - new AtomicDouble(), - new AtomicLong(), - new AtomicLong()), - offerScoringExecutor) - .whenComplete((usage, throwable) -> { - if (throwable == null && usage.isPresent()) { - currentSlaveUsagesBySlaveId.put(slaveId, new SingularitySlaveUsageWithCalculatedScores( - usage.get(), - mesosConfiguration.getScoreUsingSystemLoad(), - getMaxProbableUsageForSlave(activeTaskIds, requestUtilizations, offerHolders.get(slaveId).getSanitizedHost()), - mesosConfiguration.getLoad5OverloadedThreshold(), - mesosConfiguration.getLoad1OverloadedThreshold(), - usage.get().getTimestamp() - )); - } else { - throw new RuntimeException(throwable); - } - }); - } - } try { double score = calculateScore(offerHolder, currentSlaveUsagesBySlaveId, tasksPerOfferHost, taskRequestHolder, activeTaskIdsForRequest, requestUtilizations.get(taskRequestHolder.getTaskRequest().getRequest().getId())); diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java index 9c9449f344..83001254c6 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java @@ -146,6 +146,21 @@ public void runWithRequestLock(Runnable function, String requestId) { } } + public Optional 
collectSlaveUsage(SingularitySlave slave, long now, Map previousUtilizations) { + return collectSlaveUsage( + slave, + now, + new ConcurrentHashMap<>(), + previousUtilizations, + new ConcurrentHashMap<>(), + new AtomicLong(), + new AtomicLong(), + new AtomicDouble(), + new AtomicDouble(), + new AtomicLong(), + new AtomicLong()); + } + public Optional collectSlaveUsage( SingularitySlave slave, long now, From 50ba080c74712528767a85ca8e47bcb9d8382e5c Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Tue, 31 Jul 2018 14:05:08 -0400 Subject: [PATCH 05/26] Use semaphore --- .../singularity/mesos/SingularityMesosOfferScheduler.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index cc27af9899..22b66f71fc 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -186,7 +186,7 @@ public Collection checkOffers(final Collection of List> currentSlaveUsagesFutures = new ArrayList<>(); for (SingularityOfferHolder offerHolder : offerHolders.values()) { - currentSlaveUsagesFutures.add(CompletableFuture.runAsync(() -> { + currentSlaveUsagesFutures.add(offerScoringSemaphore.call(() -> CompletableFuture.runAsync(() -> { String slaveId = offerHolder.getSlaveId(); Optional maybeSlaveUsage = Optional.fromNullable(currentSlaveUsages.get(slaveId)); @@ -203,14 +203,14 @@ public Collection checkOffers(final Collection of usageManager.getRequestUtilizations()).get(), slaveId)); } } - }, offerScoringExecutor)); + }, offerScoringExecutor))); } CompletableFutures.allOf(currentSlaveUsagesFutures).join(); List> usagesWithScoresFutures = new ArrayList<>(); Map currentSlaveUsagesBySlaveId = new 
ConcurrentHashMap<>(); for (SingularitySlaveUsageWithId usage : currentSlaveUsages.values()) { - usagesWithScoresFutures.add( + usagesWithScoresFutures.add(offerScoringSemaphore.call(() -> CompletableFuture.runAsync(() -> currentSlaveUsagesBySlaveId.put(usage.getSlaveId(), new SingularitySlaveUsageWithCalculatedScores( usage, @@ -219,7 +219,7 @@ public Collection checkOffers(final Collection of mesosConfiguration.getLoad5OverloadedThreshold(), mesosConfiguration.getLoad1OverloadedThreshold(), usage.getTimestamp())), - offerScoringExecutor) + offerScoringExecutor)) ); } From 39b3535106560aacaf3ce05918edac04b81bdda2 Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Tue, 31 Jul 2018 14:11:40 -0400 Subject: [PATCH 06/26] Fix tests --- .../singularity/mesos/SingularityMesosOfferSchedulerTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SingularityService/src/test/java/com/hubspot/singularity/mesos/SingularityMesosOfferSchedulerTest.java b/SingularityService/src/test/java/com/hubspot/singularity/mesos/SingularityMesosOfferSchedulerTest.java index 94396f19a1..4d0153622d 100644 --- a/SingularityService/src/test/java/com/hubspot/singularity/mesos/SingularityMesosOfferSchedulerTest.java +++ b/SingularityService/src/test/java/com/hubspot/singularity/mesos/SingularityMesosOfferSchedulerTest.java @@ -251,7 +251,7 @@ private SingularitySlaveUsageWithCalculatedScores getUsage(long memMbReserved, cpusTotal, cpuInUse, cpuInUse, cpuInUse, diskMbInUse * SingularitySlaveUsage.BYTES_PER_MEGABYTE, diskMbTotal * SingularitySlaveUsage.BYTES_PER_MEGABYTE), MachineLoadMetric.LOAD_5, new MaxProbableUsage(0, 0, 0), - 0, 0 + 0, 0, System.currentTimeMillis() ); } From bee226973fc4bc6295a70691850254802534d47c Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Tue, 31 Jul 2018 15:23:16 -0400 Subject: [PATCH 07/26] Resolve dependencies --- .../mesos/SingularityMesosOfferScheduler.java | 10 +- .../scheduler/SingularityUsageHelper.java | 452 +++++++++++++++++- 
.../scheduler/SingularityUsagePoller.java | 428 +---------------- .../scheduler/TaskIdWithUsage.java | 29 ++ .../scheduler/SingularityUsageTest.java | 5 +- 5 files changed, 490 insertions(+), 434 deletions(-) create mode 100644 SingularityService/src/main/java/com/hubspot/singularity/scheduler/TaskIdWithUsage.java diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index 22b66f71fc..4c13699af4 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -50,7 +50,7 @@ import com.hubspot.singularity.mesos.SingularitySlaveUsageWithCalculatedScores.MaxProbableUsage; import com.hubspot.singularity.scheduler.SingularityLeaderCache; import com.hubspot.singularity.scheduler.SingularityScheduler; -import com.hubspot.singularity.scheduler.SingularityUsagePoller; +import com.hubspot.singularity.scheduler.SingularityUsageHelper; @Singleton public class SingularityMesosOfferScheduler { @@ -68,7 +68,7 @@ public class SingularityMesosOfferScheduler { private final SingularitySlaveAndRackManager slaveAndRackManager; private final SingularitySlaveAndRackHelper slaveAndRackHelper; private final SingularityTaskSizeOptimizer taskSizeOptimizer; - private final SingularityUsagePoller usagePoller; + private final SingularityUsageHelper usageHelper; private final SlaveManager slaveManager; private final UsageManager usageManager; private final DeployManager deployManager; @@ -94,7 +94,7 @@ public SingularityMesosOfferScheduler(MesosConfiguration mesosConfiguration, SingularityTaskSizeOptimizer taskSizeOptimizer, SingularitySlaveAndRackHelper slaveAndRackHelper, SingularityLeaderCache leaderCache, - SingularityUsagePoller usagePoller, + SingularityUsageHelper usageHelper, 
SlaveManager slaveManager, UsageManager usageManager, DeployManager deployManager, @@ -109,7 +109,7 @@ public SingularityMesosOfferScheduler(MesosConfiguration mesosConfiguration, this.slaveAndRackManager = slaveAndRackManager; this.taskSizeOptimizer = taskSizeOptimizer; this.leaderCache = leaderCache; - this.usagePoller = usagePoller; + this.usageHelper = usageHelper; this.slaveManager = slaveManager; this.slaveAndRackHelper = slaveAndRackHelper; this.taskPrioritizer = taskPrioritizer; @@ -197,7 +197,7 @@ public Collection checkOffers(final Collection of if (maybeSlave.isPresent()) { currentSlaveUsages.put( slaveId, - new SingularitySlaveUsageWithId(usagePoller.collectSlaveUsage( + new SingularitySlaveUsageWithId(usageHelper.collectSlaveUsage( maybeSlave.get(), System.currentTimeMillis(), usageManager.getRequestUtilizations()).get(), slaveId)); diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java index 089dd73907..362d8a4177 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java @@ -1,24 +1,82 @@ package com.hubspot.singularity.scheduler; import java.util.ArrayList; +import java.util.Comparator; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.ReentrantLock; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Optional; +import com.google.common.util.concurrent.AtomicDouble; import com.google.inject.Inject; import com.google.inject.Singleton; +import 
com.hubspot.mesos.Resources; +import com.hubspot.mesos.client.MesosClient; +import com.hubspot.mesos.json.MesosSlaveMetricsSnapshotObject; +import com.hubspot.mesos.json.MesosTaskMonitorObject; +import com.hubspot.singularity.ExtendedTaskState; +import com.hubspot.singularity.InvalidSingularityTaskIdException; import com.hubspot.singularity.MachineState; +import com.hubspot.singularity.RequestUtilization; +import com.hubspot.singularity.SingularityDeleteResult; +import com.hubspot.singularity.SingularityRequestWithState; import com.hubspot.singularity.SingularitySlave; +import com.hubspot.singularity.SingularitySlaveUsage; +import com.hubspot.singularity.SingularityTask; +import com.hubspot.singularity.SingularityTaskCurrentUsage; +import com.hubspot.singularity.SingularityTaskHistoryUpdate; +import com.hubspot.singularity.SingularityTaskId; +import com.hubspot.singularity.SingularityTaskUsage; +import com.hubspot.singularity.TaskCleanupType; +import com.hubspot.singularity.config.SingularityConfiguration; +import com.hubspot.singularity.data.RequestManager; import com.hubspot.singularity.data.SlaveManager; +import com.hubspot.singularity.data.TaskManager; +import com.hubspot.singularity.data.UsageManager; +import com.hubspot.singularity.sentry.SingularityExceptionNotifier; @Singleton public class SingularityUsageHelper { + private static final Logger LOG = LoggerFactory.getLogger(SingularityUsageHelper.class); + private static final long DAY_IN_SECONDS = TimeUnit.DAYS.toSeconds(1); + private final MesosClient mesosClient; + private final SingularityConfiguration configuration; + private final SingularityExceptionNotifier exceptionNotifier; + private final RequestManager requestManager; private final SlaveManager slaveManager; + private final TaskManager taskManager; + private final UsageManager usageManager; + + private final ConcurrentHashMap requestLocks; @Inject - public SingularityUsageHelper(SlaveManager slaveManager) { + public SingularityUsageHelper( + 
MesosClient mesosClient, + SingularityConfiguration configuration, + SingularityExceptionNotifier exceptionNotifier, + RequestManager requestManager, + SlaveManager slaveManager, + TaskManager taskManager, + UsageManager usageManager) { + this.mesosClient = mesosClient; + this.configuration = configuration; + this.exceptionNotifier = exceptionNotifier; + this.requestManager = requestManager; this.slaveManager = slaveManager; + this.taskManager = taskManager; + this.usageManager = usageManager; + + this.requestLocks = new ConcurrentHashMap<>(); } public Set getSlaveIdsToTrackUsageFor() { @@ -45,4 +103,396 @@ public List getSlavesToTrackUsageFor() { return slavesToTrack; } + + + + public Optional collectSlaveUsage(SingularitySlave slave, long now, Map previousUtilizations) { + return collectSlaveUsage( + slave, + now, + new ConcurrentHashMap<>(), + previousUtilizations, + new ConcurrentHashMap<>(), + new AtomicLong(), + new AtomicLong(), + new AtomicDouble(), + new AtomicDouble(), + new AtomicLong(), + new AtomicLong()); + } + + public Optional collectSlaveUsage( + SingularitySlave slave, + long now, + Map utilizationPerRequestId, + Map previousUtilizations, + Map> overLoadedHosts, + AtomicLong totalMemBytesUsed, + AtomicLong totalMemBytesAvailable, + AtomicDouble totalCpuUsed, + AtomicDouble totalCpuAvailable, + AtomicLong totalDiskBytesUsed, + AtomicLong totalDiskBytesAvailable) { + Optional memoryMbTotal = Optional.absent(); + Optional cpusTotal = Optional.absent(); + Optional diskMbTotal = Optional.absent(); + + long memoryMbReservedOnSlave = 0; + double cpuReservedOnSlave = 0; + long diskMbReservedOnSlave = 0; + + long memoryBytesUsedOnSlave = 0; + double cpusUsedOnSlave = 0; + long diskMbUsedOnSlave = 0; + + try { + List allTaskUsage = mesosClient.getSlaveResourceUsage(slave.getHost()); + MesosSlaveMetricsSnapshotObject slaveMetricsSnapshot = mesosClient.getSlaveMetricsSnapshot(slave.getHost()); + double systemMemTotalBytes = 0; + double systemMemFreeBytes = 0; 
+ double systemLoad1Min = 0; + double systemLoad5Min = 0; + double systemLoad15Min = 0; + double slaveDiskUsed = 0; + double slaveDiskTotal = 0; + double systemCpusTotal = 0; + if (slaveMetricsSnapshot != null) { + systemMemTotalBytes = slaveMetricsSnapshot.getSystemMemTotalBytes(); + systemMemFreeBytes = slaveMetricsSnapshot.getSystemMemFreeBytes(); + systemLoad1Min = slaveMetricsSnapshot.getSystemLoad1Min(); + systemLoad5Min = slaveMetricsSnapshot.getSystemLoad5Min(); + systemLoad15Min = slaveMetricsSnapshot.getSystemLoad15Min(); + slaveDiskUsed = slaveMetricsSnapshot.getSlaveDiskUsed(); + slaveDiskTotal = slaveMetricsSnapshot.getSlaveDiskTotal(); + systemCpusTotal = slaveMetricsSnapshot.getSystemCpusTotal(); + } + + double systemLoad; + switch (configuration.getMesosConfiguration().getScoreUsingSystemLoad()) { + case LOAD_1: + systemLoad = systemLoad1Min; + break; + case LOAD_15: + systemLoad = systemLoad15Min; + break; + case LOAD_5: + default: + systemLoad = systemLoad5Min; + break; + } + + boolean slaveOverloaded = systemCpusTotal > 0 && systemLoad / systemCpusTotal > 1.0; + List possibleTasksToShuffle = new ArrayList<>(); + + for (MesosTaskMonitorObject taskUsage : allTaskUsage) { + String taskId = taskUsage.getSource(); + SingularityTaskId task; + try { + task = SingularityTaskId.valueOf(taskId); + } catch (InvalidSingularityTaskIdException e) { + LOG.error("Couldn't get SingularityTaskId for {}", taskUsage); + continue; + } + + SingularityTaskUsage latestUsage = getUsage(taskUsage); + List pastTaskUsages = usageManager.getTaskUsage(taskId); + + + clearOldUsage(taskId); + usageManager.saveSpecificTaskUsage(taskId, latestUsage); + + Optional maybeTask = taskManager.getTask(task); + Optional maybeResources = Optional.absent(); + if (maybeTask.isPresent()) { + maybeResources = maybeTask.get().getTaskRequest().getPendingTask().getResources().or(maybeTask.get().getTaskRequest().getDeploy().getResources()); + if (maybeResources.isPresent()) { + Resources 
taskResources = maybeResources.get(); + double memoryMbReservedForTask = taskResources.getMemoryMb(); + double cpuReservedForTask = taskResources.getCpus(); + double diskMbReservedForTask = taskResources.getDiskMb(); + + memoryMbReservedOnSlave += memoryMbReservedForTask; + cpuReservedOnSlave += cpuReservedForTask; + diskMbReservedOnSlave += diskMbReservedForTask; + + runWithRequestLock(() -> updateRequestUtilization(utilizationPerRequestId, previousUtilizations, pastTaskUsages, latestUsage, task, memoryMbReservedForTask, cpuReservedForTask, diskMbReservedForTask), task.getRequestId()); + } + } + memoryBytesUsedOnSlave += latestUsage.getMemoryTotalBytes(); + diskMbUsedOnSlave += latestUsage.getDiskTotalBytes(); + + SingularityTaskCurrentUsage currentUsage = null; + if (pastTaskUsages.isEmpty()) { + Optional maybeStartingUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_STARTING); + if (maybeStartingUpdate.isPresent()) { + long startTimestampSeconds = TimeUnit.MILLISECONDS.toSeconds(maybeStartingUpdate.get().getTimestamp()); + double usedCpusSinceStart = latestUsage.getCpuSeconds() / (latestUsage.getTimestamp() - startTimestampSeconds); + currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, usedCpusSinceStart, latestUsage.getDiskTotalBytes()); + usageManager.saveCurrentTaskUsage(taskId, currentUsage); + + cpusUsedOnSlave += usedCpusSinceStart; + } + } else { + SingularityTaskUsage lastUsage = pastTaskUsages.get(pastTaskUsages.size() - 1); + + double taskCpusUsed = ((latestUsage.getCpuSeconds() - lastUsage.getCpuSeconds()) / (latestUsage.getTimestamp() - lastUsage.getTimestamp())); + + currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, taskCpusUsed, latestUsage.getDiskTotalBytes()); + usageManager.saveCurrentTaskUsage(taskId, currentUsage); + cpusUsedOnSlave += taskCpusUsed; + } + + if (configuration.isShuffleTasksForOverloadedSlaves() && currentUsage != null && 
currentUsage.getCpusUsed() > 0) { + if (isEligibleForShuffle(task)) { + Optional maybeCleanupUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_CLEANING); + if (maybeCleanupUpdate.isPresent() && isTaskAlreadyCleanedUpForShuffle(maybeCleanupUpdate.get())) { + LOG.trace("Task {} already being cleaned up to spread cpu usage, skipping", taskId); + } else { + if (maybeResources.isPresent()) { + possibleTasksToShuffle.add(new TaskIdWithUsage(task, maybeResources.get(), currentUsage)); + } + } + } + } + } + + if (!slave.getResources().isPresent() || + !slave.getResources().get().getMemoryMegaBytes().isPresent() || + !slave.getResources().get().getNumCpus().isPresent()) { + LOG.debug("Could not find slave or resources for slave {}", slave.getId()); + } else { + memoryMbTotal = Optional.of(slave.getResources().get().getMemoryMegaBytes().get().longValue()); + cpusTotal = Optional.of(slave.getResources().get().getNumCpus().get().doubleValue()); + diskMbTotal = Optional.of(slave.getResources().get().getDiskSpace().get()); + } + + SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage(cpusUsedOnSlave, cpuReservedOnSlave, cpusTotal, memoryBytesUsedOnSlave, memoryMbReservedOnSlave, + memoryMbTotal, diskMbUsedOnSlave, diskMbReservedOnSlave, diskMbTotal, allTaskUsage.size(), now, + systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, systemLoad1Min, systemLoad5Min, systemLoad15Min, slaveDiskUsed, slaveDiskTotal); + + if (slaveOverloaded) { + overLoadedHosts.put(slaveUsage, possibleTasksToShuffle); + } + + List slaveTimestamps = usageManager.getSlaveUsageTimestamps(slave.getId()); + if (slaveTimestamps.size() + 1 > configuration.getNumUsageToKeep()) { + usageManager.deleteSpecificSlaveUsage(slave.getId(), slaveTimestamps.get(0)); + } + + if (slaveUsage.getMemoryBytesTotal().isPresent() && slaveUsage.getCpusTotal().isPresent()) { + totalMemBytesUsed.getAndAdd(slaveUsage.getMemoryBytesUsed()); + totalCpuUsed.getAndAdd(slaveUsage.getCpusUsed()); + 
totalDiskBytesUsed.getAndAdd(slaveUsage.getDiskBytesUsed()); + + totalMemBytesAvailable.getAndAdd(slaveUsage.getMemoryBytesTotal().get()); + totalCpuAvailable.getAndAdd(slaveUsage.getCpusTotal().get()); + totalDiskBytesAvailable.getAndAdd(slaveUsage.getDiskBytesTotal().get()); + } + + LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage); + usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage); + return Optional.of(slaveUsage); + } catch (Throwable t) { + String message = String.format("Could not get slave usage for host %s", slave.getHost()); + LOG.error(message, t); + exceptionNotifier.notify(message, t); + } + return Optional.absent(); + } + + private SingularityTaskUsage getUsage(MesosTaskMonitorObject taskUsage) { + return new SingularityTaskUsage( + taskUsage.getStatistics().getMemTotalBytes(), + taskUsage.getStatistics().getTimestamp(), + taskUsage.getStatistics().getCpusSystemTimeSecs() + taskUsage.getStatistics().getCpusUserTimeSecs(), + taskUsage.getStatistics().getDiskUsedBytes(), + taskUsage.getStatistics().getCpusNrPeriods(), + taskUsage.getStatistics().getCpusNrThrottled(), + taskUsage.getStatistics().getCpusThrottledTimeSecs()); + } + + private List getFullListOfTaskUsages(List pastTaskUsages, SingularityTaskUsage latestUsage, SingularityTaskId task) { + List pastTaskUsagesCopy = new ArrayList<>(); + pastTaskUsagesCopy.add(new SingularityTaskUsage(0, TimeUnit.MILLISECONDS.toSeconds(task.getStartedAt()), 0, 0, 0 , 0, 0)); // to calculate oldest cpu usage + pastTaskUsagesCopy.addAll(pastTaskUsages); + pastTaskUsagesCopy.add(latestUsage); + + return pastTaskUsagesCopy; + } + + + private boolean isEligibleForShuffle(SingularityTaskId task) { + Optional taskRunning = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_RUNNING); + + return ( + !configuration.getDoNotShuffleRequests().contains(task.getRequestId()) + && isLongRunning(task) + && ( + configuration.getMinutesBeforeNewTaskEligibleForShuffle() == 0 // 
Shuffle delay is disabled entirely + || (taskRunning.isPresent() && TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - taskRunning.get() + .getTimestamp()) >= configuration.getMinutesBeforeNewTaskEligibleForShuffle()) + ) + ); + } + + private boolean isLongRunning(SingularityTaskId task) { + Optional request = requestManager.getRequest(task.getRequestId()); + if (request.isPresent()) { + return request.get().getRequest().getRequestType().isLongRunning(); + } + + LOG.warn("Couldn't find request id {} for task {}", task.getRequestId(), task.getId()); + return false; + } + + private boolean isTaskAlreadyCleanedUpForShuffle(SingularityTaskHistoryUpdate taskHistoryUpdate) { + if (taskHistoryUpdate.getStatusMessage().or("").contains(TaskCleanupType.REBALANCE_CPU_USAGE.name())) { + return true; + } + for (SingularityTaskHistoryUpdate previous : taskHistoryUpdate.getPrevious()) { + if (previous.getStatusMessage().or("").contains(TaskCleanupType.REBALANCE_CPU_USAGE.name())) { + return true; + } + } + return false; + } + + private void updateRequestUtilization(Map utilizationPerRequestId, + Map previousUtilizations, + List pastTaskUsages, + SingularityTaskUsage latestUsage, + SingularityTaskId task, + double memoryMbReservedForTask, + double cpuReservedForTask, + double diskMbReservedForTask) { + String requestId = task.getRequestId(); + RequestUtilization newRequestUtilization = utilizationPerRequestId.getOrDefault(requestId, new RequestUtilization(requestId, task.getDeployId())); + RequestUtilization previous = previousUtilizations.get(requestId); + // Take the previous request utilization into account to better measure 24 hour max/min values + if (previous != null) { + if (previous.getMaxMemTimestamp() < DAY_IN_SECONDS) { + newRequestUtilization.setMaxMemBytesUsed(previous.getMaxMemBytesUsed()); + newRequestUtilization.setMaxMemTimestamp(previous.getMaxMemTimestamp()); + } + if (previous.getMinMemTimestamp() < DAY_IN_SECONDS) { + 
newRequestUtilization.setMinMemBytesUsed(previous.getMinMemBytesUsed()); + newRequestUtilization.setMinMemTimestamp(previous.getMinMemTimestamp()); + } + if (previous.getMaxCpusTimestamp() < DAY_IN_SECONDS) { + newRequestUtilization.setMaxCpuUsed(previous.getMaxCpuUsed()); + newRequestUtilization.setMaxCpusTimestamp(previous.getMaxCpusTimestamp()); + } + if (previous.getMinCpusTimestamp() < DAY_IN_SECONDS) { + newRequestUtilization.setMinCpuUsed(previous.getMinCpuUsed()); + newRequestUtilization.setMinCpusTimestamp(previous.getMinCpusTimestamp()); + } + if (previous.getMaxDiskTimestamp() < DAY_IN_SECONDS) { + newRequestUtilization.setMaxDiskBytesUsed(previous.getMaxDiskBytesUsed()); + newRequestUtilization.setMaxDiskTimestamp(previous.getMaxDiskTimestamp()); + } + if (previous.getMinDiskTimestamp() < DAY_IN_SECONDS) { + newRequestUtilization.setMinDiskBytesUsed(previous.getMinDiskBytesUsed()); + newRequestUtilization.setMinDiskTimestamp(previous.getMinDiskTimestamp()); + } + if (previous.getMaxCpuThrottledTimestamp() < DAY_IN_SECONDS) { + newRequestUtilization.setMaxPercentCpuTimeThrottled(previous.getMaxPercentCpuTimeThrottled()); + newRequestUtilization.setMaxCpuThrottledTimestamp(previous.getMaxCpuThrottledTimestamp()); + } + if (previous.getMinCpuThrottledTimestamp() < DAY_IN_SECONDS) { + newRequestUtilization.setMinPercentCpuTimeThrottled(previous.getMinPercentCpuTimeThrottled()); + newRequestUtilization.setMinCpuThrottledTimestamp(previous.getMinCpuThrottledTimestamp()); + } + } + + List pastTaskUsagesCopy = getFullListOfTaskUsages(pastTaskUsages, latestUsage, task); + pastTaskUsagesCopy.sort(Comparator.comparingDouble(SingularityTaskUsage::getTimestamp)); + int numTasks = pastTaskUsagesCopy.size() - 1; // One usage is a fake 0 usage to calculate first cpu times + + int numCpuOverages = 0; + + for (int i = 0; i < numTasks; i++) { + SingularityTaskUsage olderUsage = pastTaskUsagesCopy.get(i); + SingularityTaskUsage newerUsage = pastTaskUsagesCopy.get(i + 1); + 
double cpusUsed = (newerUsage.getCpuSeconds() - olderUsage.getCpuSeconds()) / (newerUsage.getTimestamp() - olderUsage.getTimestamp()); + double percentCpuTimeThrottled = (newerUsage.getCpusThrottledTimeSecs() - olderUsage.getCpusThrottledTimeSecs()) / (newerUsage.getTimestamp() - olderUsage.getTimestamp()); + + if (cpusUsed > newRequestUtilization.getMaxCpuUsed()) { + newRequestUtilization.setMaxCpuUsed(cpusUsed); + newRequestUtilization.setMaxCpusTimestamp(newerUsage.getTimestamp()); + } + if (cpusUsed < newRequestUtilization.getMinCpuUsed()) { + newRequestUtilization.setMinCpuUsed(cpusUsed); + newRequestUtilization.setMinCpusTimestamp(newerUsage.getTimestamp()); + } + if (newerUsage.getMemoryTotalBytes() > newRequestUtilization.getMaxMemBytesUsed()) { + newRequestUtilization.setMaxMemBytesUsed(newerUsage.getMemoryTotalBytes()); + newRequestUtilization.setMaxMemTimestamp(newerUsage.getTimestamp()); + } + if (newerUsage.getMemoryTotalBytes() < newRequestUtilization.getMinMemBytesUsed()) { + newRequestUtilization.setMinMemBytesUsed(newerUsage.getMemoryTotalBytes()); + newRequestUtilization.setMinMemTimestamp(newerUsage.getTimestamp()); + } + if (newerUsage.getDiskTotalBytes() > newRequestUtilization.getMaxDiskBytesUsed()) { + newRequestUtilization.setMaxDiskBytesUsed(newerUsage.getDiskTotalBytes()); + newRequestUtilization.setMaxDiskTimestamp(newerUsage.getTimestamp()); + } + if (newerUsage.getDiskTotalBytes() < newRequestUtilization.getMinDiskBytesUsed()) { + newRequestUtilization.setMinDiskBytesUsed(newerUsage.getDiskTotalBytes()); + newRequestUtilization.setMaxDiskTimestamp(newerUsage.getTimestamp()); + } + if (percentCpuTimeThrottled > newRequestUtilization.getMaxPercentCpuTimeThrottled()) { + newRequestUtilization.setMaxPercentCpuTimeThrottled(percentCpuTimeThrottled); + newRequestUtilization.setMaxCpuThrottledTimestamp(newerUsage.getTimestamp()); + } + if (percentCpuTimeThrottled < newRequestUtilization.getMinPercentCpuTimeThrottled()) { + 
newRequestUtilization.setMinPercentCpuTimeThrottled(percentCpuTimeThrottled); + newRequestUtilization.setMinCpuThrottledTimestamp(newerUsage.getTimestamp()); + } + + if (cpusUsed > cpuReservedForTask) { + numCpuOverages++; + } + + newRequestUtilization + .addCpuUsed(cpusUsed) + .addMemBytesUsed(newerUsage.getMemoryTotalBytes()) + .addPercentCpuTimeThrottled(percentCpuTimeThrottled) + .addDiskBytesUsed(newerUsage.getDiskTotalBytes()) + .incrementTaskCount(); + } + + double cpuBurstRating = pastTaskUsagesCopy.size() > 0 ? numCpuOverages / (double) pastTaskUsagesCopy.size() : 1; + + newRequestUtilization + .addMemBytesReserved((long) (memoryMbReservedForTask * SingularitySlaveUsage.BYTES_PER_MEGABYTE * numTasks)) + .addCpuReserved(cpuReservedForTask * numTasks) + .addDiskBytesReserved((long) diskMbReservedForTask * SingularitySlaveUsage.BYTES_PER_MEGABYTE * numTasks) + .setCpuBurstRating(cpuBurstRating); + + utilizationPerRequestId.put(requestId, newRequestUtilization); + } + + @VisibleForTesting + void clearOldUsage(String taskId) { + usageManager.getTaskUsagePaths(taskId) + .stream() + .map(Double::parseDouble) + .skip(configuration.getNumUsageToKeep()) + .forEach((pathId) -> { + SingularityDeleteResult result = usageManager.deleteSpecificTaskUsage(taskId, pathId); + if (result.equals(SingularityDeleteResult.DIDNT_EXIST)) { + LOG.warn("Didn't delete taskUsage {} for taskId {}", pathId.toString(), taskId); + } + }); + } + + public void runWithRequestLock(Runnable function, String requestId) { + ReentrantLock lock = requestLocks.computeIfAbsent(requestId, (r) -> new ReentrantLock()); + lock.lock(); + try { + function.run(); + } finally { + lock.unlock(); + } + } } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java index 83001254c6..5fc8ba7dbb 100644 --- 
a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java @@ -1,7 +1,6 @@ package com.hubspot.singularity.scheduler; import java.util.ArrayList; -import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.Set; @@ -18,32 +17,17 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; import com.google.common.util.concurrent.AtomicDouble; import com.google.common.util.concurrent.ThreadFactoryBuilder; import com.google.inject.Inject; -import com.hubspot.mesos.Resources; -import com.hubspot.mesos.client.MesosClient; -import com.hubspot.mesos.json.MesosSlaveMetricsSnapshotObject; -import com.hubspot.mesos.json.MesosTaskMonitorObject; -import com.hubspot.singularity.ExtendedTaskState; -import com.hubspot.singularity.InvalidSingularityTaskIdException; import com.hubspot.singularity.RequestUtilization; import com.hubspot.singularity.SingularityClusterUtilization; -import com.hubspot.singularity.SingularityDeleteResult; import com.hubspot.singularity.SingularityDeploy; import com.hubspot.singularity.SingularityPendingRequest; import com.hubspot.singularity.SingularityPendingRequest.PendingType; -import com.hubspot.singularity.SingularityRequestWithState; -import com.hubspot.singularity.SingularitySlave; import com.hubspot.singularity.SingularitySlaveUsage; -import com.hubspot.singularity.SingularityTask; import com.hubspot.singularity.SingularityTaskCleanup; -import com.hubspot.singularity.SingularityTaskCurrentUsage; -import com.hubspot.singularity.SingularityTaskHistoryUpdate; -import com.hubspot.singularity.SingularityTaskId; -import com.hubspot.singularity.SingularityTaskUsage; import com.hubspot.singularity.TaskCleanupType; import com.hubspot.singularity.async.AsyncSemaphore; import 
com.hubspot.singularity.async.CompletableFutures; @@ -52,18 +36,14 @@ import com.hubspot.singularity.data.RequestManager; import com.hubspot.singularity.data.TaskManager; import com.hubspot.singularity.data.UsageManager; -import com.hubspot.singularity.sentry.SingularityExceptionNotifier; public class SingularityUsagePoller extends SingularityLeaderOnlyPoller { private static final Logger LOG = LoggerFactory.getLogger(SingularityUsagePoller.class); - private static final long DAY_IN_SECONDS = TimeUnit.DAYS.toSeconds(1); private final SingularityConfiguration configuration; - private final MesosClient mesosClient; private final UsageManager usageManager; private final SingularityUsageHelper usageHelper; - private final SingularityExceptionNotifier exceptionNotifier; private final RequestManager requestManager; private final DeployManager deployManager; private final TaskManager taskManager; @@ -76,8 +56,6 @@ public class SingularityUsagePoller extends SingularityLeaderOnlyPoller { SingularityUsagePoller(SingularityConfiguration configuration, SingularityUsageHelper usageHelper, UsageManager usageManager, - MesosClient mesosClient, - SingularityExceptionNotifier exceptionNotifier, RequestManager requestManager, DeployManager deployManager, TaskManager taskManager) { @@ -85,9 +63,7 @@ public class SingularityUsagePoller extends SingularityLeaderOnlyPoller { this.configuration = configuration; this.usageHelper = usageHelper; - this.mesosClient = mesosClient; this.usageManager = usageManager; - this.exceptionNotifier = exceptionNotifier; this.requestManager = requestManager; this.deployManager = deployManager; this.taskManager = taskManager; @@ -117,7 +93,7 @@ public void runActionOnPoll() { usageHelper.getSlavesToTrackUsageFor().forEach((slave) -> { usageFutures.add(usageCollectionSemaphore.call(() -> CompletableFuture.supplyAsync(() -> { - return collectSlaveUsage(slave, now, utilizationPerRequestId, previousUtilizations, overLoadedHosts, totalMemBytesUsed, 
totalMemBytesAvailable, + return usageHelper.collectSlaveUsage(slave, now, utilizationPerRequestId, previousUtilizations, overLoadedHosts, totalMemBytesUsed, totalMemBytesAvailable, totalCpuUsed, totalCpuAvailable, totalDiskBytesUsed, totalDiskBytesAvailable).get(); }, usageExecutor) )); @@ -146,214 +122,6 @@ public void runWithRequestLock(Runnable function, String requestId) { } } - public Optional collectSlaveUsage(SingularitySlave slave, long now, Map previousUtilizations) { - return collectSlaveUsage( - slave, - now, - new ConcurrentHashMap<>(), - previousUtilizations, - new ConcurrentHashMap<>(), - new AtomicLong(), - new AtomicLong(), - new AtomicDouble(), - new AtomicDouble(), - new AtomicLong(), - new AtomicLong()); - } - - public Optional collectSlaveUsage( - SingularitySlave slave, - long now, - Map utilizationPerRequestId, - Map previousUtilizations, - Map> overLoadedHosts, - AtomicLong totalMemBytesUsed, - AtomicLong totalMemBytesAvailable, - AtomicDouble totalCpuUsed, - AtomicDouble totalCpuAvailable, - AtomicLong totalDiskBytesUsed, - AtomicLong totalDiskBytesAvailable) { - Optional memoryMbTotal = Optional.absent(); - Optional cpusTotal = Optional.absent(); - Optional diskMbTotal = Optional.absent(); - - long memoryMbReservedOnSlave = 0; - double cpuReservedOnSlave = 0; - long diskMbReservedOnSlave = 0; - - long memoryBytesUsedOnSlave = 0; - double cpusUsedOnSlave = 0; - long diskMbUsedOnSlave = 0; - - try { - List allTaskUsage = mesosClient.getSlaveResourceUsage(slave.getHost()); - MesosSlaveMetricsSnapshotObject slaveMetricsSnapshot = mesosClient.getSlaveMetricsSnapshot(slave.getHost()); - double systemMemTotalBytes = 0; - double systemMemFreeBytes = 0; - double systemLoad1Min = 0; - double systemLoad5Min = 0; - double systemLoad15Min = 0; - double slaveDiskUsed = 0; - double slaveDiskTotal = 0; - double systemCpusTotal = 0; - if (slaveMetricsSnapshot != null) { - systemMemTotalBytes = slaveMetricsSnapshot.getSystemMemTotalBytes(); - 
systemMemFreeBytes = slaveMetricsSnapshot.getSystemMemFreeBytes(); - systemLoad1Min = slaveMetricsSnapshot.getSystemLoad1Min(); - systemLoad5Min = slaveMetricsSnapshot.getSystemLoad5Min(); - systemLoad15Min = slaveMetricsSnapshot.getSystemLoad15Min(); - slaveDiskUsed = slaveMetricsSnapshot.getSlaveDiskUsed(); - slaveDiskTotal = slaveMetricsSnapshot.getSlaveDiskTotal(); - systemCpusTotal = slaveMetricsSnapshot.getSystemCpusTotal(); - } - - double systemLoad; - switch (configuration.getMesosConfiguration().getScoreUsingSystemLoad()) { - case LOAD_1: - systemLoad = systemLoad1Min; - break; - case LOAD_15: - systemLoad = systemLoad15Min; - break; - case LOAD_5: - default: - systemLoad = systemLoad5Min; - break; - } - - boolean slaveOverloaded = systemCpusTotal > 0 && systemLoad / systemCpusTotal > 1.0; - List possibleTasksToShuffle = new ArrayList<>(); - - for (MesosTaskMonitorObject taskUsage : allTaskUsage) { - String taskId = taskUsage.getSource(); - SingularityTaskId task; - try { - task = SingularityTaskId.valueOf(taskId); - } catch (InvalidSingularityTaskIdException e) { - LOG.error("Couldn't get SingularityTaskId for {}", taskUsage); - continue; - } - - SingularityTaskUsage latestUsage = getUsage(taskUsage); - List pastTaskUsages = usageManager.getTaskUsage(taskId); - - - clearOldUsage(taskId); - usageManager.saveSpecificTaskUsage(taskId, latestUsage); - - Optional maybeTask = taskManager.getTask(task); - Optional maybeResources = Optional.absent(); - if (maybeTask.isPresent()) { - maybeResources = maybeTask.get().getTaskRequest().getPendingTask().getResources().or(maybeTask.get().getTaskRequest().getDeploy().getResources()); - if (maybeResources.isPresent()) { - Resources taskResources = maybeResources.get(); - double memoryMbReservedForTask = taskResources.getMemoryMb(); - double cpuReservedForTask = taskResources.getCpus(); - double diskMbReservedForTask = taskResources.getDiskMb(); - - memoryMbReservedOnSlave += memoryMbReservedForTask; - cpuReservedOnSlave 
+= cpuReservedForTask; - diskMbReservedOnSlave += diskMbReservedForTask; - - runWithRequestLock(() -> updateRequestUtilization(utilizationPerRequestId, previousUtilizations, pastTaskUsages, latestUsage, task, memoryMbReservedForTask, cpuReservedForTask, diskMbReservedForTask), task.getRequestId()); - } - } - memoryBytesUsedOnSlave += latestUsage.getMemoryTotalBytes(); - diskMbUsedOnSlave += latestUsage.getDiskTotalBytes(); - - SingularityTaskCurrentUsage currentUsage = null; - if (pastTaskUsages.isEmpty()) { - Optional maybeStartingUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_STARTING); - if (maybeStartingUpdate.isPresent()) { - long startTimestampSeconds = TimeUnit.MILLISECONDS.toSeconds(maybeStartingUpdate.get().getTimestamp()); - double usedCpusSinceStart = latestUsage.getCpuSeconds() / (latestUsage.getTimestamp() - startTimestampSeconds); - currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, usedCpusSinceStart, latestUsage.getDiskTotalBytes()); - usageManager.saveCurrentTaskUsage(taskId, currentUsage); - - cpusUsedOnSlave += usedCpusSinceStart; - } - } else { - SingularityTaskUsage lastUsage = pastTaskUsages.get(pastTaskUsages.size() - 1); - - double taskCpusUsed = ((latestUsage.getCpuSeconds() - lastUsage.getCpuSeconds()) / (latestUsage.getTimestamp() - lastUsage.getTimestamp())); - - currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, taskCpusUsed, latestUsage.getDiskTotalBytes()); - usageManager.saveCurrentTaskUsage(taskId, currentUsage); - cpusUsedOnSlave += taskCpusUsed; - } - - if (configuration.isShuffleTasksForOverloadedSlaves() && currentUsage != null && currentUsage.getCpusUsed() > 0) { - if (isEligibleForShuffle(task)) { - Optional maybeCleanupUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_CLEANING); - if (maybeCleanupUpdate.isPresent() && isTaskAlreadyCleanedUpForShuffle(maybeCleanupUpdate.get())) { - LOG.trace("Task {} already 
being cleaned up to spread cpu usage, skipping", taskId); - } else { - if (maybeResources.isPresent()) { - possibleTasksToShuffle.add(new TaskIdWithUsage(task, maybeResources.get(), currentUsage)); - } - } - } - } - } - - if (!slave.getResources().isPresent() || - !slave.getResources().get().getMemoryMegaBytes().isPresent() || - !slave.getResources().get().getNumCpus().isPresent()) { - LOG.debug("Could not find slave or resources for slave {}", slave.getId()); - } else { - memoryMbTotal = Optional.of(slave.getResources().get().getMemoryMegaBytes().get().longValue()); - cpusTotal = Optional.of(slave.getResources().get().getNumCpus().get().doubleValue()); - diskMbTotal = Optional.of(slave.getResources().get().getDiskSpace().get()); - } - - SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage(cpusUsedOnSlave, cpuReservedOnSlave, cpusTotal, memoryBytesUsedOnSlave, memoryMbReservedOnSlave, - memoryMbTotal, diskMbUsedOnSlave, diskMbReservedOnSlave, diskMbTotal, allTaskUsage.size(), now, - systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, systemLoad1Min, systemLoad5Min, systemLoad15Min, slaveDiskUsed, slaveDiskTotal); - - if (slaveOverloaded) { - overLoadedHosts.put(slaveUsage, possibleTasksToShuffle); - } - - List slaveTimestamps = usageManager.getSlaveUsageTimestamps(slave.getId()); - if (slaveTimestamps.size() + 1 > configuration.getNumUsageToKeep()) { - usageManager.deleteSpecificSlaveUsage(slave.getId(), slaveTimestamps.get(0)); - } - - if (slaveUsage.getMemoryBytesTotal().isPresent() && slaveUsage.getCpusTotal().isPresent()) { - totalMemBytesUsed.getAndAdd(slaveUsage.getMemoryBytesUsed()); - totalCpuUsed.getAndAdd(slaveUsage.getCpusUsed()); - totalDiskBytesUsed.getAndAdd(slaveUsage.getDiskBytesUsed()); - - totalMemBytesAvailable.getAndAdd(slaveUsage.getMemoryBytesTotal().get()); - totalCpuAvailable.getAndAdd(slaveUsage.getCpusTotal().get()); - totalDiskBytesAvailable.getAndAdd(slaveUsage.getDiskBytesTotal().get()); - } - - LOG.debug("Saving slave {} 
usage {}", slave.getHost(), slaveUsage); - usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage); - return Optional.of(slaveUsage); - } catch (Throwable t) { - String message = String.format("Could not get slave usage for host %s", slave.getHost()); - LOG.error(message, t); - exceptionNotifier.notify(message, t); - } - return Optional.absent(); - } - - private boolean isEligibleForShuffle(SingularityTaskId task) { - Optional taskRunning = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_RUNNING); - - return ( - !configuration.getDoNotShuffleRequests().contains(task.getRequestId()) - && isLongRunning(task) - && ( - configuration.getMinutesBeforeNewTaskEligibleForShuffle() == 0 // Shuffle delay is disabled entirely - || (taskRunning.isPresent() && TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - taskRunning.get() - .getTimestamp()) >= configuration.getMinutesBeforeNewTaskEligibleForShuffle()) - ) - ); - } - private void shuffleTasksOnOverloadedHosts(Map> overLoadedHosts) { List shuffleCleanups = taskManager.getCleanupTasks() .stream() @@ -435,163 +203,6 @@ private double getSystemLoadForShuffle(SingularitySlaveUsage usage) { } } - private boolean isTaskAlreadyCleanedUpForShuffle(SingularityTaskHistoryUpdate taskHistoryUpdate) { - if (taskHistoryUpdate.getStatusMessage().or("").contains(TaskCleanupType.REBALANCE_CPU_USAGE.name())) { - return true; - } - for (SingularityTaskHistoryUpdate previous : taskHistoryUpdate.getPrevious()) { - if (previous.getStatusMessage().or("").contains(TaskCleanupType.REBALANCE_CPU_USAGE.name())) { - return true; - } - } - return false; - } - - private SingularityTaskUsage getUsage(MesosTaskMonitorObject taskUsage) { - return new SingularityTaskUsage( - taskUsage.getStatistics().getMemTotalBytes(), - taskUsage.getStatistics().getTimestamp(), - taskUsage.getStatistics().getCpusSystemTimeSecs() + taskUsage.getStatistics().getCpusUserTimeSecs(), - taskUsage.getStatistics().getDiskUsedBytes(), - 
taskUsage.getStatistics().getCpusNrPeriods(), - taskUsage.getStatistics().getCpusNrThrottled(), - taskUsage.getStatistics().getCpusThrottledTimeSecs()); - } - - private boolean isLongRunning(SingularityTaskId task) { - Optional request = requestManager.getRequest(task.getRequestId()); - if (request.isPresent()) { - return request.get().getRequest().getRequestType().isLongRunning(); - } - - LOG.warn("Couldn't find request id {} for task {}", task.getRequestId(), task.getId()); - return false; - } - - private void updateRequestUtilization(Map utilizationPerRequestId, - Map previousUtilizations, - List pastTaskUsages, - SingularityTaskUsage latestUsage, - SingularityTaskId task, - double memoryMbReservedForTask, - double cpuReservedForTask, - double diskMbReservedForTask) { - String requestId = task.getRequestId(); - RequestUtilization newRequestUtilization = utilizationPerRequestId.getOrDefault(requestId, new RequestUtilization(requestId, task.getDeployId())); - RequestUtilization previous = previousUtilizations.get(requestId); - // Take the previous request utilization into account to better measure 24 hour max/min values - if (previous != null) { - if (previous.getMaxMemTimestamp() < DAY_IN_SECONDS) { - newRequestUtilization.setMaxMemBytesUsed(previous.getMaxMemBytesUsed()); - newRequestUtilization.setMaxMemTimestamp(previous.getMaxMemTimestamp()); - } - if (previous.getMinMemTimestamp() < DAY_IN_SECONDS) { - newRequestUtilization.setMinMemBytesUsed(previous.getMinMemBytesUsed()); - newRequestUtilization.setMinMemTimestamp(previous.getMinMemTimestamp()); - } - if (previous.getMaxCpusTimestamp() < DAY_IN_SECONDS) { - newRequestUtilization.setMaxCpuUsed(previous.getMaxCpuUsed()); - newRequestUtilization.setMaxCpusTimestamp(previous.getMaxCpusTimestamp()); - } - if (previous.getMinCpusTimestamp() < DAY_IN_SECONDS) { - newRequestUtilization.setMinCpuUsed(previous.getMinCpuUsed()); - newRequestUtilization.setMinCpusTimestamp(previous.getMinCpusTimestamp()); - } - if 
(previous.getMaxDiskTimestamp() < DAY_IN_SECONDS) { - newRequestUtilization.setMaxDiskBytesUsed(previous.getMaxDiskBytesUsed()); - newRequestUtilization.setMaxDiskTimestamp(previous.getMaxDiskTimestamp()); - } - if (previous.getMinDiskTimestamp() < DAY_IN_SECONDS) { - newRequestUtilization.setMinDiskBytesUsed(previous.getMinDiskBytesUsed()); - newRequestUtilization.setMinDiskTimestamp(previous.getMinDiskTimestamp()); - } - if (previous.getMaxCpuThrottledTimestamp() < DAY_IN_SECONDS) { - newRequestUtilization.setMaxPercentCpuTimeThrottled(previous.getMaxPercentCpuTimeThrottled()); - newRequestUtilization.setMaxCpuThrottledTimestamp(previous.getMaxCpuThrottledTimestamp()); - } - if (previous.getMinCpuThrottledTimestamp() < DAY_IN_SECONDS) { - newRequestUtilization.setMinPercentCpuTimeThrottled(previous.getMinPercentCpuTimeThrottled()); - newRequestUtilization.setMinCpuThrottledTimestamp(previous.getMinCpuThrottledTimestamp()); - } - } - - List pastTaskUsagesCopy = getFullListOfTaskUsages(pastTaskUsages, latestUsage, task); - pastTaskUsagesCopy.sort(Comparator.comparingDouble(SingularityTaskUsage::getTimestamp)); - int numTasks = pastTaskUsagesCopy.size() - 1; // One usage is a fake 0 usage to calculate first cpu times - - int numCpuOverages = 0; - - for (int i = 0; i < numTasks; i++) { - SingularityTaskUsage olderUsage = pastTaskUsagesCopy.get(i); - SingularityTaskUsage newerUsage = pastTaskUsagesCopy.get(i + 1); - double cpusUsed = (newerUsage.getCpuSeconds() - olderUsage.getCpuSeconds()) / (newerUsage.getTimestamp() - olderUsage.getTimestamp()); - double percentCpuTimeThrottled = (newerUsage.getCpusThrottledTimeSecs() - olderUsage.getCpusThrottledTimeSecs()) / (newerUsage.getTimestamp() - olderUsage.getTimestamp()); - - if (cpusUsed > newRequestUtilization.getMaxCpuUsed()) { - newRequestUtilization.setMaxCpuUsed(cpusUsed); - newRequestUtilization.setMaxCpusTimestamp(newerUsage.getTimestamp()); - } - if (cpusUsed < newRequestUtilization.getMinCpuUsed()) { - 
newRequestUtilization.setMinCpuUsed(cpusUsed); - newRequestUtilization.setMinCpusTimestamp(newerUsage.getTimestamp()); - } - if (newerUsage.getMemoryTotalBytes() > newRequestUtilization.getMaxMemBytesUsed()) { - newRequestUtilization.setMaxMemBytesUsed(newerUsage.getMemoryTotalBytes()); - newRequestUtilization.setMaxMemTimestamp(newerUsage.getTimestamp()); - } - if (newerUsage.getMemoryTotalBytes() < newRequestUtilization.getMinMemBytesUsed()) { - newRequestUtilization.setMinMemBytesUsed(newerUsage.getMemoryTotalBytes()); - newRequestUtilization.setMinMemTimestamp(newerUsage.getTimestamp()); - } - if (newerUsage.getDiskTotalBytes() > newRequestUtilization.getMaxDiskBytesUsed()) { - newRequestUtilization.setMaxDiskBytesUsed(newerUsage.getDiskTotalBytes()); - newRequestUtilization.setMaxDiskTimestamp(newerUsage.getTimestamp()); - } - if (newerUsage.getDiskTotalBytes() < newRequestUtilization.getMinDiskBytesUsed()) { - newRequestUtilization.setMinDiskBytesUsed(newerUsage.getDiskTotalBytes()); - newRequestUtilization.setMaxDiskTimestamp(newerUsage.getTimestamp()); - } - if (percentCpuTimeThrottled > newRequestUtilization.getMaxPercentCpuTimeThrottled()) { - newRequestUtilization.setMaxPercentCpuTimeThrottled(percentCpuTimeThrottled); - newRequestUtilization.setMaxCpuThrottledTimestamp(newerUsage.getTimestamp()); - } - if (percentCpuTimeThrottled < newRequestUtilization.getMinPercentCpuTimeThrottled()) { - newRequestUtilization.setMinPercentCpuTimeThrottled(percentCpuTimeThrottled); - newRequestUtilization.setMinCpuThrottledTimestamp(newerUsage.getTimestamp()); - } - - if (cpusUsed > cpuReservedForTask) { - numCpuOverages++; - } - - newRequestUtilization - .addCpuUsed(cpusUsed) - .addMemBytesUsed(newerUsage.getMemoryTotalBytes()) - .addPercentCpuTimeThrottled(percentCpuTimeThrottled) - .addDiskBytesUsed(newerUsage.getDiskTotalBytes()) - .incrementTaskCount(); - } - - double cpuBurstRating = pastTaskUsagesCopy.size() > 0 ? 
numCpuOverages / (double) pastTaskUsagesCopy.size() : 1; - - newRequestUtilization - .addMemBytesReserved((long) (memoryMbReservedForTask * SingularitySlaveUsage.BYTES_PER_MEGABYTE * numTasks)) - .addCpuReserved(cpuReservedForTask * numTasks) - .addDiskBytesReserved((long) diskMbReservedForTask * SingularitySlaveUsage.BYTES_PER_MEGABYTE * numTasks) - .setCpuBurstRating(cpuBurstRating); - - utilizationPerRequestId.put(requestId, newRequestUtilization); - } - - private List getFullListOfTaskUsages(List pastTaskUsages, SingularityTaskUsage latestUsage, SingularityTaskId task) { - List pastTaskUsagesCopy = new ArrayList<>(); - pastTaskUsagesCopy.add(new SingularityTaskUsage(0, TimeUnit.MILLISECONDS.toSeconds(task.getStartedAt()), 0, 0, 0 , 0, 0)); // to calculate oldest cpu usage - pastTaskUsagesCopy.addAll(pastTaskUsages); - pastTaskUsagesCopy.add(latestUsage); - - return pastTaskUsagesCopy; - } - private SingularityClusterUtilization getClusterUtilization(Map utilizationPerRequestId, long totalMemBytesUsed, long totalMemBytesAvailable, @@ -701,41 +312,4 @@ private long getMin(long value) { return value == Long.MAX_VALUE ? 
0 : value; } - @VisibleForTesting - void clearOldUsage(String taskId) { - usageManager.getTaskUsagePaths(taskId) - .stream() - .map(Double::parseDouble) - .skip(configuration.getNumUsageToKeep()) - .forEach((pathId) -> { - SingularityDeleteResult result = usageManager.deleteSpecificTaskUsage(taskId, pathId); - if (result.equals(SingularityDeleteResult.DIDNT_EXIST)) { - LOG.warn("Didn't delete taskUsage {} for taskId {}", pathId.toString(), taskId); - } - }); - } - - private static class TaskIdWithUsage { - private final SingularityTaskId taskId; - private final Resources requestedResources; - private final SingularityTaskCurrentUsage usage; - - TaskIdWithUsage(SingularityTaskId taskId, Resources requestedResources, SingularityTaskCurrentUsage usage) { - this.taskId = taskId; - this.requestedResources = requestedResources; - this.usage = usage; - } - - public SingularityTaskId getTaskId() { - return taskId; - } - - public Resources getRequestedResources() { - return requestedResources; - } - - public SingularityTaskCurrentUsage getUsage() { - return usage; - } - } } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/TaskIdWithUsage.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/TaskIdWithUsage.java new file mode 100644 index 0000000000..e29669ed50 --- /dev/null +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/TaskIdWithUsage.java @@ -0,0 +1,29 @@ +package com.hubspot.singularity.scheduler; + +import com.hubspot.mesos.Resources; +import com.hubspot.singularity.SingularityTaskCurrentUsage; +import com.hubspot.singularity.SingularityTaskId; + +class TaskIdWithUsage { + private final SingularityTaskId taskId; + private final Resources requestedResources; + private final SingularityTaskCurrentUsage usage; + + TaskIdWithUsage(SingularityTaskId taskId, Resources requestedResources, SingularityTaskCurrentUsage usage) { + this.taskId = taskId; + this.requestedResources = requestedResources; + 
this.usage = usage; + } + + public SingularityTaskId getTaskId() { + return taskId; + } + + public Resources getRequestedResources() { + return requestedResources; + } + + public SingularityTaskCurrentUsage getUsage() { + return usage; + } +} diff --git a/SingularityService/src/test/java/com/hubspot/singularity/scheduler/SingularityUsageTest.java b/SingularityService/src/test/java/com/hubspot/singularity/scheduler/SingularityUsageTest.java index e41b60c906..c7b0658e1a 100644 --- a/SingularityService/src/test/java/com/hubspot/singularity/scheduler/SingularityUsageTest.java +++ b/SingularityService/src/test/java/com/hubspot/singularity/scheduler/SingularityUsageTest.java @@ -27,6 +27,9 @@ public class SingularityUsageTest extends SingularitySchedulerTestBase { + @Inject + protected SingularityUsageHelper usageHelper; + @Inject protected SingularityUsagePoller usagePoller; @@ -639,7 +642,7 @@ private void saveTaskUsage(String taskId, long... times) { } private void clearUsages(String taskId) { - usagePoller.clearOldUsage(taskId); + usageHelper.clearOldUsage(taskId); } private void testUtilization(SingularityClusterUtilization utilization, From 6c02f60958e09d64d90754357aaebc5a384a0bbd Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Tue, 31 Jul 2018 15:52:38 -0400 Subject: [PATCH 08/26] Fix tests --- .../singularity/mesos/SingularityMesosOfferScheduler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index 4c13699af4..e2f8338d4c 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -190,7 +190,7 @@ public Collection checkOffers(final Collection of String slaveId = 
offerHolder.getSlaveId(); Optional maybeSlaveUsage = Optional.fromNullable(currentSlaveUsages.get(slaveId)); - if (taskManager.getActiveTasks().stream() + if (maybeSlaveUsage.isPresent() && taskManager.getActiveTasks().stream() .anyMatch(t -> t.getTaskRequest().getDeploy().getTimestamp().or(System.currentTimeMillis()) > maybeSlaveUsage.get().getTimestamp() && t.getMesosTask().getSlaveId().getValue().equals(slaveId))) { Optional maybeSlave = slaveManager.getSlave(slaveId); From fe45d81434cabc8fdc168ebb6854c93d8b629f35 Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Wed, 1 Aug 2018 14:35:01 -0400 Subject: [PATCH 09/26] Add timeout for slave usage checking --- .../com/hubspot/mesos/client/MesosClient.java | 2 +- .../mesos/client/SingularityMesosClient.java | 20 ++++++++++++------- .../client/SingularityMesosClientModule.java | 11 +++++++--- .../mesos/SingularityMesosOfferScheduler.java | 3 ++- .../scheduler/SingularityUsageHelper.java | 10 ++++++---- .../scheduler/SingularityUsagePoller.java | 10 +++++----- 6 files changed, 35 insertions(+), 21 deletions(-) diff --git a/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/MesosClient.java b/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/MesosClient.java index 5102ad851c..8e3b314b03 100644 --- a/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/MesosClient.java +++ b/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/MesosClient.java @@ -41,6 +41,6 @@ public MesosClientException(String message, Throwable cause) { public MesosSlaveStateObject getSlaveState(String uri); - public List getSlaveResourceUsage(String hostname); + public List getSlaveResourceUsage(String hostname, boolean useShortTimeout); } diff --git a/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/SingularityMesosClient.java b/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/SingularityMesosClient.java index f224fa3bbf..ceb28d27e2 100644 --- 
a/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/SingularityMesosClient.java +++ b/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/SingularityMesosClient.java @@ -11,6 +11,7 @@ import com.google.inject.name.Named; import com.hubspot.horizon.HttpClient; import com.hubspot.horizon.HttpRequest; +import com.hubspot.horizon.HttpRequest.Options; import com.hubspot.horizon.HttpResponse; import com.hubspot.mesos.JavaUtils; import com.hubspot.mesos.json.MesosMasterMetricsSnapshotObject; @@ -22,7 +23,8 @@ @Singleton public class SingularityMesosClient implements MesosClient { - public static final String HTTP_CLIENT_NAME = "mesos.http.client"; + public static final String DEFAULT_HTTP_CLIENT_NAME = "mesos.http.client"; + public static final String SHORT_TIMEOUT_HTTP_CLIENT_NAME = "mesos.http.client.short.timeout"; private static final Logger LOG = LoggerFactory.getLogger(SingularityMesosClient.class); @@ -35,10 +37,13 @@ public class SingularityMesosClient implements MesosClient { private static final TypeReference> TASK_MONITOR_TYPE_REFERENCE = new TypeReference>() {}; private final HttpClient httpClient; + private final HttpClient shortTimeoutHttpClient; @Inject - public SingularityMesosClient(@Named(HTTP_CLIENT_NAME) HttpClient httpClient) { + public SingularityMesosClient(@Named(DEFAULT_HTTP_CLIENT_NAME) HttpClient httpClient, + @Named(SHORT_TIMEOUT_HTTP_CLIENT_NAME) HttpClient shortTimeoutHttpClient) { this.httpClient = httpClient; + this.shortTimeoutHttpClient = shortTimeoutHttpClient; } @Override @@ -51,7 +56,8 @@ public String getMasterMetricsSnapshotUri(String hostnameAndPort) { return String.format(MESOS_MASTER_METRICS_SNAPSHOT_URL, hostnameAndPort); } - private HttpResponse getFromMesos(String uri) { + private HttpResponse getFromMesos(String uri, boolean useShortTimeout) { + HttpClient currentHttpClient = useShortTimeout ? 
shortTimeoutHttpClient : httpClient; HttpResponse response = null; final long start = System.currentTimeMillis(); @@ -59,7 +65,7 @@ private HttpResponse getFromMesos(String uri) { LOG.debug("Fetching {} from mesos", uri); try { - response = httpClient.execute(HttpRequest.newBuilder().setUrl(uri).build()); + response = currentHttpClient.execute(HttpRequest.newBuilder().setUrl(uri).build(), new Options()); LOG.debug("Response {} - {} after {}", response.getStatusCode(), uri, JavaUtils.duration(start)); } catch (Exception e) { @@ -74,7 +80,7 @@ private HttpResponse getFromMesos(String uri) { } private T getFromMesos(String uri, Class clazz) { - HttpResponse response = getFromMesos(uri); + HttpResponse response = getFromMesos(uri, false); try { return response.getAs(clazz); @@ -109,10 +115,10 @@ public MesosSlaveStateObject getSlaveState(String uri) { } @Override - public List getSlaveResourceUsage(String hostname) { + public List getSlaveResourceUsage(String hostname, boolean useShortTimeout) { final String uri = String.format(MESOS_SLAVE_STATISTICS_URL, hostname); - HttpResponse response = getFromMesos(uri); + HttpResponse response = getFromMesos(uri, useShortTimeout); try { return response.getAs(TASK_MONITOR_TYPE_REFERENCE); diff --git a/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/SingularityMesosClientModule.java b/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/SingularityMesosClientModule.java index 0671fe57cc..690430f5d0 100644 --- a/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/SingularityMesosClientModule.java +++ b/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/SingularityMesosClientModule.java @@ -6,21 +6,26 @@ import com.google.inject.name.Names; import com.hubspot.horizon.HttpClient; import com.hubspot.horizon.HttpConfig; +import com.hubspot.horizon.HttpConfig.Builder; import com.hubspot.horizon.ning.NingHttpClient; import com.hubspot.mesos.JavaUtils; public class SingularityMesosClientModule 
extends AbstractModule { public static final String MESOS_CLIENT_OBJECT_MAPPER = "singularity.mesos.client.object.mapper"; + private static final int MESOS_CLIENT_HTTP_SHORT_TIMEOUT_SECONDS = 5; @Override protected void configure() { ObjectMapper objectMapper = JavaUtils.newObjectMapper(); - HttpConfig httpConfig = HttpConfig.newBuilder().setObjectMapper(objectMapper).build(); - HttpClient httpClient = new NingHttpClient(httpConfig); + Builder httpConfigBuilder = HttpConfig.newBuilder().setObjectMapper(objectMapper); bind(ObjectMapper.class).annotatedWith(Names.named(MESOS_CLIENT_OBJECT_MAPPER)).toInstance(objectMapper); - bind(HttpClient.class).annotatedWith(Names.named(SingularityMesosClient.HTTP_CLIENT_NAME)).toInstance(httpClient); + bind(HttpClient.class).annotatedWith(Names.named(SingularityMesosClient.DEFAULT_HTTP_CLIENT_NAME)) + .toInstance(new NingHttpClient(httpConfigBuilder.build())); + + bind(HttpClient.class).annotatedWith(Names.named(SingularityMesosClient.SHORT_TIMEOUT_HTTP_CLIENT_NAME)) + .toInstance(new NingHttpClient(httpConfigBuilder.setRequestTimeoutSeconds(MESOS_CLIENT_HTTP_SHORT_TIMEOUT_SECONDS).build())); bind(MesosClient.class).to(SingularityMesosClient.class).in(Scopes.SINGLETON); } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index e2f8338d4c..3d38f8e7fb 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -200,7 +200,8 @@ public Collection checkOffers(final Collection of new SingularitySlaveUsageWithId(usageHelper.collectSlaveUsage( maybeSlave.get(), System.currentTimeMillis(), - usageManager.getRequestUtilizations()).get(), slaveId)); + usageManager.getRequestUtilizations(), + true).get(), slaveId)); } } }, 
offerScoringExecutor))); diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java index 362d8a4177..a7c746027f 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java @@ -106,7 +106,7 @@ public List getSlavesToTrackUsageFor() { - public Optional collectSlaveUsage(SingularitySlave slave, long now, Map previousUtilizations) { + public Optional collectSlaveUsage(SingularitySlave slave, long now, Map previousUtilizations, boolean useShortTimeout) { return collectSlaveUsage( slave, now, @@ -118,7 +118,8 @@ public Optional collectSlaveUsage(SingularitySlave slave, new AtomicDouble(), new AtomicDouble(), new AtomicLong(), - new AtomicLong()); + new AtomicLong(), + useShortTimeout); } public Optional collectSlaveUsage( @@ -132,7 +133,8 @@ public Optional collectSlaveUsage( AtomicDouble totalCpuUsed, AtomicDouble totalCpuAvailable, AtomicLong totalDiskBytesUsed, - AtomicLong totalDiskBytesAvailable) { + AtomicLong totalDiskBytesAvailable, + boolean useShortTimeout) { Optional memoryMbTotal = Optional.absent(); Optional cpusTotal = Optional.absent(); Optional diskMbTotal = Optional.absent(); @@ -146,7 +148,7 @@ public Optional collectSlaveUsage( long diskMbUsedOnSlave = 0; try { - List allTaskUsage = mesosClient.getSlaveResourceUsage(slave.getHost()); + List allTaskUsage = mesosClient.getSlaveResourceUsage(slave.getHost(), useShortTimeout); MesosSlaveMetricsSnapshotObject slaveMetricsSnapshot = mesosClient.getSlaveMetricsSnapshot(slave.getHost()); double systemMemTotalBytes = 0; double systemMemFreeBytes = 0; diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java 
b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java index 5fc8ba7dbb..77636e04a8 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java @@ -48,7 +48,7 @@ public class SingularityUsagePoller extends SingularityLeaderOnlyPoller { private final DeployManager deployManager; private final TaskManager taskManager; - private final AsyncSemaphore usageCollectionSemaphore; + private final AsyncSemaphore usageCollectionSemaphore; private final ExecutorService usageExecutor; private final ConcurrentHashMap requestLocks; @@ -88,13 +88,13 @@ public void runActionOnPoll() { Map> overLoadedHosts = new ConcurrentHashMap<>(); - List> usageFutures = new ArrayList<>(); + List> usageFutures = new ArrayList<>(); usageHelper.getSlavesToTrackUsageFor().forEach((slave) -> { usageFutures.add(usageCollectionSemaphore.call(() -> - CompletableFuture.supplyAsync(() -> { - return usageHelper.collectSlaveUsage(slave, now, utilizationPerRequestId, previousUtilizations, overLoadedHosts, totalMemBytesUsed, totalMemBytesAvailable, - totalCpuUsed, totalCpuAvailable, totalDiskBytesUsed, totalDiskBytesAvailable).get(); + CompletableFuture.runAsync(() -> { + usageHelper.collectSlaveUsage(slave, now, utilizationPerRequestId, previousUtilizations, overLoadedHosts, totalMemBytesUsed, totalMemBytesAvailable, + totalCpuUsed, totalCpuAvailable, totalDiskBytesUsed, totalDiskBytesAvailable, false); }, usageExecutor) )); }); From 1414edf0b09f5913986af04129ae34cfc55288f1 Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Wed, 1 Aug 2018 14:42:53 -0400 Subject: [PATCH 10/26] Build fix --- .../src/main/java/com/hubspot/mesos/client/MesosClient.java | 4 ++++ .../hubspot/singularity/scheduler/SingularityUsageHelper.java | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git 
a/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/MesosClient.java b/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/MesosClient.java index 8e3b314b03..1ba75881eb 100644 --- a/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/MesosClient.java +++ b/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/MesosClient.java @@ -41,6 +41,10 @@ public MesosClientException(String message, Throwable cause) { public MesosSlaveStateObject getSlaveState(String uri); + default List getSlaveResourceUsage(String hostname) { + return getSlaveResourceUsage(hostname, false); + } + public List getSlaveResourceUsage(String hostname, boolean useShortTimeout); } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java index a7c746027f..6ae0cd9efb 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java @@ -103,9 +103,6 @@ public List getSlavesToTrackUsageFor() { return slavesToTrack; } - - - public Optional collectSlaveUsage(SingularitySlave slave, long now, Map previousUtilizations, boolean useShortTimeout) { return collectSlaveUsage( slave, From 46506393a873406ef061575ea9328c95534bd142 Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Wed, 1 Aug 2018 14:46:02 -0400 Subject: [PATCH 11/26] Build fix --- .../com/hubspot/singularity/scheduler/TestingMesosClient.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SingularityService/src/test/java/com/hubspot/singularity/scheduler/TestingMesosClient.java b/SingularityService/src/test/java/com/hubspot/singularity/scheduler/TestingMesosClient.java index 46cf68c82a..56f17f924c 100644 --- a/SingularityService/src/test/java/com/hubspot/singularity/scheduler/TestingMesosClient.java 
+++ b/SingularityService/src/test/java/com/hubspot/singularity/scheduler/TestingMesosClient.java @@ -66,7 +66,7 @@ public MesosSlaveStateObject getSlaveState(String uri) { } @Override - public List getSlaveResourceUsage(String hostname) { + public List getSlaveResourceUsage(String hostname, boolean useShortTimeout) { return slaveResourceUsage.getOrDefault(hostname, Collections.emptyList()); } From 890504bb9340ab1a0caf0f93cae998945b6e9d58 Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Wed, 1 Aug 2018 15:33:34 -0400 Subject: [PATCH 12/26] Fix failed data refresh --- .../mesos/SingularityMesosOfferScheduler.java | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index 3d38f8e7fb..c62d3141a9 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -195,13 +195,17 @@ public Collection checkOffers(final Collection of && t.getMesosTask().getSlaveId().getValue().equals(slaveId))) { Optional maybeSlave = slaveManager.getSlave(slaveId); if (maybeSlave.isPresent()) { - currentSlaveUsages.put( - slaveId, - new SingularitySlaveUsageWithId(usageHelper.collectSlaveUsage( - maybeSlave.get(), - System.currentTimeMillis(), - usageManager.getRequestUtilizations(), - true).get(), slaveId)); + Optional usage = usageHelper.collectSlaveUsage( + maybeSlave.get(), + System.currentTimeMillis(), + usageManager.getRequestUtilizations(), + true); + if (usage.isPresent()) { + currentSlaveUsages.put(slaveId, new SingularitySlaveUsageWithId(usage.get(), slaveId)); + } else { + LOG.warn("Failed to refresh stale slave usage data for {}. 
Will not schedule tasks right now.", maybeSlave.get().getName()); + currentSlaveUsages.remove(slaveId); + } } } }, offerScoringExecutor))); From 5d7de8052ba40de0c2f169259723e6fd1175d57e Mon Sep 17 00:00:00 2001 From: Stephen Salinas Date: Thu, 2 Aug 2018 09:08:06 -0400 Subject: [PATCH 13/26] Make fewer zk calls for usage fetching --- .../com/hubspot/singularity/data/UsageManager.java | 14 ++++++++++++++ .../mesos/SingularityMesosOfferScheduler.java | 14 ++++++++++++-- .../scheduler/SingularityUsageHelper.java | 5 ++--- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/data/UsageManager.java b/SingularityService/src/main/java/com/hubspot/singularity/data/UsageManager.java index 23d489626b..8f29961927 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/data/UsageManager.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/data/UsageManager.java @@ -6,6 +6,7 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; @@ -192,6 +193,19 @@ public Map getRequestUtilizations() { )); } + public Map getRequestUtilizations(Set requestIds) { + List paths = new ArrayList<>(); + for (String requestId : requestIds) { + paths.add(getRequestPath(requestId)); + } + return getAsync("/usage/requests", paths, requestUtilizationTranscoder) + .stream() + .collect(Collectors.toMap( + RequestUtilization::getRequestId, + Function.identity() + )); + } + public Optional getRequestUtilization(String requestId) { return getData(getRequestPath(requestId), requestUtilizationTranscoder); } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index c62d3141a9..5ed1de50cf 100644 --- 
a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -3,8 +3,10 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; @@ -149,6 +151,7 @@ public Collection checkOffers(final Collection of final List sortedTaskRequestHolders = getSortedDueTaskRequests(); final int numDueTasks = sortedTaskRequestHolders.size(); + Set relevantRequestIds = new HashSet<>(); final Map offerHolders = offers.stream() .collect(Collectors.groupingBy((o) -> o.getAgentId().getValue())) @@ -166,6 +169,13 @@ public Collection checkOffers(final Collection of slaveAndRackHelper.getTextAttributes(offersList.get(0)), slaveAndRackHelper.getReservedSlaveAttributes(offersList.get(0))); }) + .peek((offerHolder) -> { + taskManager.getActiveTaskIds().forEach((t) -> { + if (t.getSanitizedHost().equals(offerHolder.getSanitizedHost())) { + relevantRequestIds.add(t.getRequestId()); + } + }); + }) .collect(Collectors.toMap(SingularityOfferHolder::getSlaveId, Function.identity())); if (sortedTaskRequestHolders.isEmpty()) { @@ -173,7 +183,7 @@ public Collection checkOffers(final Collection of } final AtomicInteger tasksScheduled = new AtomicInteger(0); - Map requestUtilizations = usageManager.getRequestUtilizations(); + Map requestUtilizations = usageManager.getRequestUtilizations(relevantRequestIds); List activeTaskIds = taskManager.getActiveTaskIds(); Map currentSlaveUsages = usageManager.getCurrentSlaveUsages( @@ -198,7 +208,7 @@ public Collection checkOffers(final Collection of Optional usage = usageHelper.collectSlaveUsage( maybeSlave.get(), System.currentTimeMillis(), - 
usageManager.getRequestUtilizations(), + requestUtilizations, true); if (usage.isPresent()) { currentSlaveUsages.put(slaveId, new SingularitySlaveUsageWithId(usage.get(), slaveId)); diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java index 6ae0cd9efb..95ab4d69f5 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java @@ -214,7 +214,7 @@ public Optional collectSlaveUsage( cpuReservedOnSlave += cpuReservedForTask; diskMbReservedOnSlave += diskMbReservedForTask; - runWithRequestLock(() -> updateRequestUtilization(utilizationPerRequestId, previousUtilizations, pastTaskUsages, latestUsage, task, memoryMbReservedForTask, cpuReservedForTask, diskMbReservedForTask), task.getRequestId()); + runWithRequestLock(() -> updateRequestUtilization(utilizationPerRequestId, previousUtilizations.get(maybeTask.get().getTaskRequest().getRequest().getId()), pastTaskUsages, latestUsage, task, memoryMbReservedForTask, cpuReservedForTask, diskMbReservedForTask), task.getRequestId()); } } memoryBytesUsedOnSlave += latestUsage.getMemoryTotalBytes(); @@ -357,7 +357,7 @@ private boolean isTaskAlreadyCleanedUpForShuffle(SingularityTaskHistoryUpdate ta } private void updateRequestUtilization(Map utilizationPerRequestId, - Map previousUtilizations, + RequestUtilization previous, List pastTaskUsages, SingularityTaskUsage latestUsage, SingularityTaskId task, @@ -366,7 +366,6 @@ private void updateRequestUtilization(Map utilizatio double diskMbReservedForTask) { String requestId = task.getRequestId(); RequestUtilization newRequestUtilization = utilizationPerRequestId.getOrDefault(requestId, new RequestUtilization(requestId, task.getDeployId())); - RequestUtilization previous = 
previousUtilizations.get(requestId); // Take the previous request utilization into account to better measure 24 hour max/min values if (previous != null) { if (previous.getMaxMemTimestamp() < DAY_IN_SECONDS) { From 572407e68949cf2645721d07c6edce91c9d0faa6 Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Mon, 6 Aug 2018 15:50:19 -0400 Subject: [PATCH 14/26] Add new method for slave usage --- .../mesos/SingularityMesosOfferScheduler.java | 9 +- .../scheduler/SingularityUsageHelper.java | 107 +++++++++++++++--- 2 files changed, 97 insertions(+), 19 deletions(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index c62d3141a9..d5b204954a 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -190,15 +190,14 @@ public Collection checkOffers(final Collection of String slaveId = offerHolder.getSlaveId(); Optional maybeSlaveUsage = Optional.fromNullable(currentSlaveUsages.get(slaveId)); - if (maybeSlaveUsage.isPresent() && taskManager.getActiveTasks().stream() - .anyMatch(t -> t.getTaskRequest().getDeploy().getTimestamp().or(System.currentTimeMillis()) > maybeSlaveUsage.get().getTimestamp() - && t.getMesosTask().getSlaveId().getValue().equals(slaveId))) { + if (maybeSlaveUsage.isPresent() && taskManager.getActiveTaskIds().stream() + .anyMatch(t -> t.getStartedAt() > maybeSlaveUsage.get().getTimestamp() + && t.getSanitizedHost().equals(offerHolder.getSanitizedHost()))) { Optional maybeSlave = slaveManager.getSlave(slaveId); if (maybeSlave.isPresent()) { - Optional usage = usageHelper.collectSlaveUsage( + Optional usage = usageHelper.collectSlaveUsageSimple( maybeSlave.get(), System.currentTimeMillis(), - usageManager.getRequestUtilizations(), 
true); if (usage.isPresent()) { currentSlaveUsages.put(slaveId, new SingularitySlaveUsageWithId(usage.get(), slaveId)); diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java index 6ae0cd9efb..48dd4613a5 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java @@ -103,20 +103,99 @@ public List getSlavesToTrackUsageFor() { return slavesToTrack; } - public Optional collectSlaveUsage(SingularitySlave slave, long now, Map previousUtilizations, boolean useShortTimeout) { - return collectSlaveUsage( - slave, - now, - new ConcurrentHashMap<>(), - previousUtilizations, - new ConcurrentHashMap<>(), - new AtomicLong(), - new AtomicLong(), - new AtomicDouble(), - new AtomicDouble(), - new AtomicLong(), - new AtomicLong(), - useShortTimeout); + public Optional collectSlaveUsageSimple( + SingularitySlave slave, + long now, + boolean useShortTimeout) { + Optional memoryMbTotal = Optional.absent(); + Optional cpusTotal = Optional.absent(); + Optional diskMbTotal = Optional.absent(); + + try { + List allTaskUsage = mesosClient.getSlaveResourceUsage(slave.getHost(), useShortTimeout); + MesosSlaveMetricsSnapshotObject slaveMetricsSnapshot = mesosClient.getSlaveMetricsSnapshot(slave.getHost()); + + long memoryMbReservedOnSlave = 0L; + double cpuReservedOnSlave = 0.0; + long diskMbReservedOnSlave = 0L; + + long memoryBytesUsedOnSlave = 0; + double cpusUsedOnSlave = 0; + long diskMbUsedOnSlave = 0; + + double systemMemTotalBytes = 0; + double systemMemFreeBytes = 0; + double systemLoad1Min = 0; + double systemLoad5Min = 0; + double systemLoad15Min = 0; + double slaveDiskUsed = 0; + double slaveDiskTotal = 0; + double systemCpusTotal = 0; + if (slaveMetricsSnapshot != null) { + 
memoryMbReservedOnSlave = (long) slaveMetricsSnapshot.getSlaveMemUsed(); + cpuReservedOnSlave = slaveMetricsSnapshot.getSlaveCpusUsed(); + diskMbReservedOnSlave = (long) slaveMetricsSnapshot.getSlaveDiskUsed(); + systemMemTotalBytes = slaveMetricsSnapshot.getSystemMemTotalBytes(); + systemMemFreeBytes = slaveMetricsSnapshot.getSystemMemFreeBytes(); + systemLoad1Min = slaveMetricsSnapshot.getSystemLoad1Min(); + systemLoad5Min = slaveMetricsSnapshot.getSystemLoad5Min(); + systemLoad15Min = slaveMetricsSnapshot.getSystemLoad15Min(); + slaveDiskUsed = slaveMetricsSnapshot.getSlaveDiskUsed(); + slaveDiskTotal = slaveMetricsSnapshot.getSlaveDiskTotal(); + systemCpusTotal = slaveMetricsSnapshot.getSystemCpusTotal(); + } + + for (MesosTaskMonitorObject taskUsage : allTaskUsage) { + String taskId = taskUsage.getSource(); + SingularityTaskId task; + try { + task = SingularityTaskId.valueOf(taskId); + } catch (InvalidSingularityTaskIdException e) { + LOG.error("Couldn't get SingularityTaskId for {}", taskUsage); + continue; + } + + SingularityTaskUsage latestUsage = getUsage(taskUsage); + memoryBytesUsedOnSlave += latestUsage.getMemoryTotalBytes(); + diskMbUsedOnSlave += latestUsage.getDiskTotalBytes(); + + List pastTaskUsages = usageManager.getTaskUsage(taskId); + if (pastTaskUsages.isEmpty()) { + Optional maybeStartingUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_STARTING); + if (maybeStartingUpdate.isPresent()) { + long startTimestampSeconds = TimeUnit.MILLISECONDS.toSeconds(maybeStartingUpdate.get().getTimestamp()); + cpusUsedOnSlave += latestUsage.getCpuSeconds() / (latestUsage.getTimestamp() - startTimestampSeconds); + } + } else { + SingularityTaskUsage lastUsage = pastTaskUsages.get(pastTaskUsages.size() - 1); + cpusUsedOnSlave += ((latestUsage.getCpuSeconds() - lastUsage.getCpuSeconds()) / (latestUsage.getTimestamp() - lastUsage.getTimestamp())); + } + } + + if (!slave.getResources().isPresent() || + 
!slave.getResources().get().getMemoryMegaBytes().isPresent() || + !slave.getResources().get().getNumCpus().isPresent()) { + LOG.debug("Could not find slave or resources for slave {}", slave.getId()); + } else { + memoryMbTotal = Optional.of(slave.getResources().get().getMemoryMegaBytes().get().longValue()); + cpusTotal = Optional.of(slave.getResources().get().getNumCpus().get().doubleValue()); + diskMbTotal = Optional.of(slave.getResources().get().getDiskSpace().get()); + } + + // system to used, slave to reserved + SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage(cpusUsedOnSlave, cpuReservedOnSlave, cpusTotal, memoryBytesUsedOnSlave, memoryMbReservedOnSlave, + memoryMbTotal, diskMbUsedOnSlave, diskMbReservedOnSlave, diskMbTotal, allTaskUsage.size(), now, + systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, systemLoad1Min, systemLoad5Min, systemLoad15Min, slaveDiskUsed, slaveDiskTotal); + + LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage); + usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage); + return Optional.of(slaveUsage); + } catch (Throwable t) { + String message = String.format("Could not get slave usage for host %s", slave.getHost()); + LOG.error(message, t); + exceptionNotifier.notify(message, t); + } + return Optional.absent(); } public Optional collectSlaveUsage( From d527799c28147754c70c3317c9a68221e4718a6d Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Mon, 6 Aug 2018 16:20:37 -0400 Subject: [PATCH 15/26] rm comment --- .../hubspot/singularity/scheduler/SingularityUsageHelper.java | 1 - 1 file changed, 1 deletion(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java index a2ed7aeb16..d5e28e5244 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java +++ 
b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java @@ -182,7 +182,6 @@ public Optional collectSlaveUsageSimple( diskMbTotal = Optional.of(slave.getResources().get().getDiskSpace().get()); } - // system to used, slave to reserved SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage(cpusUsedOnSlave, cpuReservedOnSlave, cpusTotal, memoryBytesUsedOnSlave, memoryMbReservedOnSlave, memoryMbTotal, diskMbUsedOnSlave, diskMbReservedOnSlave, diskMbTotal, allTaskUsage.size(), now, systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, systemLoad1Min, systemLoad5Min, systemLoad15Min, slaveDiskUsed, slaveDiskTotal); From d055ff0f363020b52d9f03614b24f487f2e36043 Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Mon, 6 Aug 2018 16:35:59 -0400 Subject: [PATCH 16/26] longs to doubles --- .../singularity/SingularitySlaveUsage.java | 24 +++++++++---------- .../scheduler/SingularityUsageHelper.java | 16 ++++++------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/SingularityBase/src/main/java/com/hubspot/singularity/SingularitySlaveUsage.java b/SingularityBase/src/main/java/com/hubspot/singularity/SingularitySlaveUsage.java index c694033592..bbf414b8b5 100644 --- a/SingularityBase/src/main/java/com/hubspot/singularity/SingularitySlaveUsage.java +++ b/SingularityBase/src/main/java/com/hubspot/singularity/SingularitySlaveUsage.java @@ -16,11 +16,11 @@ public class SingularitySlaveUsage { private final double cpusUsed; private final double cpusReserved; private final Optional cpusTotal; - private final long memoryBytesUsed; - private final long memoryMbReserved; + private final double memoryBytesUsed; + private final double memoryMbReserved; private final Optional memoryMbTotal; - private final long diskBytesUsed; - private final long diskMbReserved; + private final double diskBytesUsed; + private final double diskMbReserved; private final Optional diskMbTotal; private final int numTasks; private final long 
timestamp; @@ -37,11 +37,11 @@ public class SingularitySlaveUsage { public SingularitySlaveUsage(@JsonProperty("cpusUsed") double cpusUsed, @JsonProperty("cpusReserved") double cpusReserved, @JsonProperty("cpusTotal") Optional cpusTotal, - @JsonProperty("memoryBytesUsed") long memoryBytesUsed, - @JsonProperty("memoryMbReserved") long memoryMbReserved, + @JsonProperty("memoryBytesUsed") double memoryBytesUsed, + @JsonProperty("memoryMbReserved") double memoryMbReserved, @JsonProperty("memoryMbTotal") Optional memoryMbTotal, - @JsonProperty("diskBytesUsed") long diskBytesUsed, - @JsonProperty("diskMbReserved") long diskMbReserved, + @JsonProperty("diskBytesUsed") double diskBytesUsed, + @JsonProperty("diskMbReserved") double diskMbReserved, @JsonProperty("diskMbTotal") Optional diskMbTotal, @JsonProperty("numTasks") int numTasks, @JsonProperty("timestamp") long timestamp, @@ -94,12 +94,12 @@ public Optional getCpusTotal() { } @Schema(description = "Total memory used by tasks in bytes") - public long getMemoryBytesUsed() { + public double getMemoryBytesUsed() { return memoryBytesUsed; } @Schema(description = "Total memory reserved by tasks in MB") - public long getMemoryMbReserved() { + public double getMemoryMbReserved() { return memoryMbReserved; } @@ -122,12 +122,12 @@ public Optional getMemoryBytesTotal() { } @Schema(description = "Total disk currently used by tasks in bytes") - public long getDiskBytesUsed() { + public double getDiskBytesUsed() { return diskBytesUsed; } @Schema(description = "Total disk currently reserved by tasks in MB") - public long getDiskMbReserved() { + public double getDiskMbReserved() { return diskMbReserved; } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java index d5e28e5244..21d2cbaac9 100644 --- 
a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java @@ -115,13 +115,13 @@ public Optional collectSlaveUsageSimple( List allTaskUsage = mesosClient.getSlaveResourceUsage(slave.getHost(), useShortTimeout); MesosSlaveMetricsSnapshotObject slaveMetricsSnapshot = mesosClient.getSlaveMetricsSnapshot(slave.getHost()); - long memoryMbReservedOnSlave = 0L; + double memoryMbReservedOnSlave = 0L; double cpuReservedOnSlave = 0.0; - long diskMbReservedOnSlave = 0L; + double diskMbReservedOnSlave = 0L; - long memoryBytesUsedOnSlave = 0; + double memoryBytesUsedOnSlave = 0; double cpusUsedOnSlave = 0; - long diskMbUsedOnSlave = 0; + double diskMbUsedOnSlave = 0; double systemMemTotalBytes = 0; double systemMemFreeBytes = 0; @@ -132,9 +132,9 @@ public Optional collectSlaveUsageSimple( double slaveDiskTotal = 0; double systemCpusTotal = 0; if (slaveMetricsSnapshot != null) { - memoryMbReservedOnSlave = (long) slaveMetricsSnapshot.getSlaveMemUsed(); + memoryMbReservedOnSlave = slaveMetricsSnapshot.getSlaveMemUsed(); cpuReservedOnSlave = slaveMetricsSnapshot.getSlaveCpusUsed(); - diskMbReservedOnSlave = (long) slaveMetricsSnapshot.getSlaveDiskUsed(); + diskMbReservedOnSlave = slaveMetricsSnapshot.getSlaveDiskUsed(); systemMemTotalBytes = slaveMetricsSnapshot.getSystemMemTotalBytes(); systemMemFreeBytes = slaveMetricsSnapshot.getSystemMemFreeBytes(); systemLoad1Min = slaveMetricsSnapshot.getSystemLoad1Min(); @@ -357,9 +357,9 @@ public Optional collectSlaveUsage( } if (slaveUsage.getMemoryBytesTotal().isPresent() && slaveUsage.getCpusTotal().isPresent()) { - totalMemBytesUsed.getAndAdd(slaveUsage.getMemoryBytesUsed()); + totalMemBytesUsed.getAndAdd((long) slaveUsage.getMemoryBytesUsed()); totalCpuUsed.getAndAdd(slaveUsage.getCpusUsed()); - totalDiskBytesUsed.getAndAdd(slaveUsage.getDiskBytesUsed()); + totalDiskBytesUsed.getAndAdd((long) 
slaveUsage.getDiskBytesUsed()); totalMemBytesAvailable.getAndAdd(slaveUsage.getMemoryBytesTotal().get()); totalCpuAvailable.getAndAdd(slaveUsage.getCpusTotal().get()); From 01275177c1e247f697b443f9d582beef8cee0e39 Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Mon, 6 Aug 2018 16:43:33 -0400 Subject: [PATCH 17/26] Add test tolerances --- .../hubspot/singularity/scheduler/SingularityUsageTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/SingularityService/src/test/java/com/hubspot/singularity/scheduler/SingularityUsageTest.java b/SingularityService/src/test/java/com/hubspot/singularity/scheduler/SingularityUsageTest.java index c7b0658e1a..d5163c67b8 100644 --- a/SingularityService/src/test/java/com/hubspot/singularity/scheduler/SingularityUsageTest.java +++ b/SingularityService/src/test/java/com/hubspot/singularity/scheduler/SingularityUsageTest.java @@ -71,7 +71,7 @@ public void testUsagePollerSimple() { Assert.assertEquals(slaves.get(0), slaveId); Assert.assertEquals(0, usageManager.getSlaveUsage(slaveId).get(0).getCpusUsed(), 0); - Assert.assertEquals(100, usageManager.getSlaveUsage(slaveId).get(0).getMemoryBytesUsed()); + Assert.assertEquals(100, usageManager.getSlaveUsage(slaveId).get(0).getMemoryBytesUsed(), 0); SingularityTaskUsage first = usageManager.getTaskUsage(firstTask.getTaskId().getId()).get(0); @@ -106,7 +106,7 @@ public void testUsageCleaner() { Assert.assertEquals(2, usageManager.getTasksWithUsage().size()); Assert.assertEquals(1, usageManager.getSlavesWithUsage().size()); - Assert.assertEquals(1100, usageManager.getAllCurrentSlaveUsage().get(0).getMemoryBytesUsed()); + Assert.assertEquals(1100, usageManager.getAllCurrentSlaveUsage().get(0).getMemoryBytesUsed(), 0); // kill task one statusUpdate(taskManager.getActiveTasks().get(0), TaskState.TASK_KILLED); @@ -195,7 +195,7 @@ public void testUsagePoller() throws InterruptedException { Assert.assertEquals(1149, 
usageManager.getSlaveUsage(slaveId).get(1).getMemoryBytesUsed(), 0); Assert.assertEquals(slaveId, usageManager.getAllCurrentSlaveUsage().get(0).getSlaveId()); - Assert.assertEquals(1149, usageManager.getAllCurrentSlaveUsage().get(0).getMemoryBytesUsed()); + Assert.assertEquals(1149, usageManager.getAllCurrentSlaveUsage().get(0).getMemoryBytesUsed(), 0); List taskCurrentUsages = usageManager.getTaskCurrentUsages(taskManager.getActiveTaskIds()); From f4de9ee55719d0708aa32de24f1f497a2ac1556f Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Tue, 7 Aug 2018 15:05:13 -0400 Subject: [PATCH 18/26] Remove more zk calls --- .../scheduler/SingularityUsageHelper.java | 40 +++++-------------- 1 file changed, 11 insertions(+), 29 deletions(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java index 21d2cbaac9..ec4169deb5 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java @@ -143,33 +143,10 @@ public Optional collectSlaveUsageSimple( slaveDiskUsed = slaveMetricsSnapshot.getSlaveDiskUsed(); slaveDiskTotal = slaveMetricsSnapshot.getSlaveDiskTotal(); systemCpusTotal = slaveMetricsSnapshot.getSystemCpusTotal(); - } - - for (MesosTaskMonitorObject taskUsage : allTaskUsage) { - String taskId = taskUsage.getSource(); - SingularityTaskId task; - try { - task = SingularityTaskId.valueOf(taskId); - } catch (InvalidSingularityTaskIdException e) { - LOG.error("Couldn't get SingularityTaskId for {}", taskUsage); - continue; - } - SingularityTaskUsage latestUsage = getUsage(taskUsage); - memoryBytesUsedOnSlave += latestUsage.getMemoryTotalBytes(); - diskMbUsedOnSlave += latestUsage.getDiskTotalBytes(); - - List pastTaskUsages = usageManager.getTaskUsage(taskId); - if 
(pastTaskUsages.isEmpty()) { - Optional maybeStartingUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_STARTING); - if (maybeStartingUpdate.isPresent()) { - long startTimestampSeconds = TimeUnit.MILLISECONDS.toSeconds(maybeStartingUpdate.get().getTimestamp()); - cpusUsedOnSlave += latestUsage.getCpuSeconds() / (latestUsage.getTimestamp() - startTimestampSeconds); - } - } else { - SingularityTaskUsage lastUsage = pastTaskUsages.get(pastTaskUsages.size() - 1); - cpusUsedOnSlave += ((latestUsage.getCpuSeconds() - lastUsage.getCpuSeconds()) / (latestUsage.getTimestamp() - lastUsage.getTimestamp())); - } + memoryBytesUsedOnSlave = slaveMetricsSnapshot.getSlaveMemUsed(); + diskMbUsedOnSlave = slaveMetricsSnapshot.getSlaveDiskUsed(); + cpusUsedOnSlave = slaveMetricsSnapshot.getSlaveCpusUsed(); } if (!slave.getResources().isPresent() || @@ -182,9 +159,14 @@ public Optional collectSlaveUsageSimple( diskMbTotal = Optional.of(slave.getResources().get().getDiskSpace().get()); } - SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage(cpusUsedOnSlave, cpuReservedOnSlave, cpusTotal, memoryBytesUsedOnSlave, memoryMbReservedOnSlave, - memoryMbTotal, diskMbUsedOnSlave, diskMbReservedOnSlave, diskMbTotal, allTaskUsage.size(), now, - systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, systemLoad1Min, systemLoad5Min, systemLoad15Min, slaveDiskUsed, slaveDiskTotal); + SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage( + cpusUsedOnSlave, cpuReservedOnSlave, cpusTotal, + memoryBytesUsedOnSlave, memoryMbReservedOnSlave, memoryMbTotal, + diskMbUsedOnSlave, diskMbReservedOnSlave, diskMbTotal, + allTaskUsage.size(), now, + systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, + systemLoad1Min, systemLoad5Min, systemLoad15Min, + slaveDiskUsed, slaveDiskTotal); LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage); usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage); From 
df9c3a0c79f92a519abf54fe545ef7a4ee3961d3 Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Tue, 7 Aug 2018 15:07:07 -0400 Subject: [PATCH 19/26] Condense duplicate variables --- .../scheduler/SingularityUsageHelper.java | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java index ec4169deb5..f21f2a4a5a 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java @@ -119,10 +119,6 @@ public Optional collectSlaveUsageSimple( double cpuReservedOnSlave = 0.0; double diskMbReservedOnSlave = 0L; - double memoryBytesUsedOnSlave = 0; - double cpusUsedOnSlave = 0; - double diskMbUsedOnSlave = 0; - double systemMemTotalBytes = 0; double systemMemFreeBytes = 0; double systemLoad1Min = 0; @@ -143,10 +139,6 @@ public Optional collectSlaveUsageSimple( slaveDiskUsed = slaveMetricsSnapshot.getSlaveDiskUsed(); slaveDiskTotal = slaveMetricsSnapshot.getSlaveDiskTotal(); systemCpusTotal = slaveMetricsSnapshot.getSystemCpusTotal(); - - memoryBytesUsedOnSlave = slaveMetricsSnapshot.getSlaveMemUsed(); - diskMbUsedOnSlave = slaveMetricsSnapshot.getSlaveDiskUsed(); - cpusUsedOnSlave = slaveMetricsSnapshot.getSlaveCpusUsed(); } if (!slave.getResources().isPresent() || @@ -160,9 +152,9 @@ public Optional collectSlaveUsageSimple( } SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage( - cpusUsedOnSlave, cpuReservedOnSlave, cpusTotal, - memoryBytesUsedOnSlave, memoryMbReservedOnSlave, memoryMbTotal, - diskMbUsedOnSlave, diskMbReservedOnSlave, diskMbTotal, + cpuReservedOnSlave, cpuReservedOnSlave, cpusTotal, + memoryMbReservedOnSlave, memoryMbReservedOnSlave, memoryMbTotal, + memoryMbReservedOnSlave, 
diskMbReservedOnSlave, diskMbTotal, allTaskUsage.size(), now, systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, systemLoad1Min, systemLoad5Min, systemLoad15Min, From b1fbeda42cdc9848b794f740a53ecfaf93e739cf Mon Sep 17 00:00:00 2001 From: Paul Schoenfelder Date: Tue, 7 Aug 2018 15:07:47 -0400 Subject: [PATCH 20/26] Fix typo --- .../hubspot/singularity/scheduler/SingularityUsageHelper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java index f21f2a4a5a..5b23b89503 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java @@ -154,7 +154,7 @@ public Optional collectSlaveUsageSimple( SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage( cpuReservedOnSlave, cpuReservedOnSlave, cpusTotal, memoryMbReservedOnSlave, memoryMbReservedOnSlave, memoryMbTotal, - memoryMbReservedOnSlave, diskMbReservedOnSlave, diskMbTotal, + diskMbReservedOnSlave, diskMbReservedOnSlave, diskMbTotal, allTaskUsage.size(), now, systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, systemLoad1Min, systemLoad5Min, systemLoad15Min, From 5ec231bfd249bd59af1bcd14d39e90af95a069aa Mon Sep 17 00:00:00 2001 From: Stephen Salinas Date: Wed, 8 Aug 2018 09:45:49 -0400 Subject: [PATCH 21/26] Skip hosts which do not have valid metrics during offer processing --- .../config/SingularityConfiguration.java | 10 +++ .../mesos/SingularityMesosOfferScheduler.java | 23 ++---- .../scheduler/SingularityUsageHelper.java | 72 +------------------ 3 files changed, 17 insertions(+), 88 deletions(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java 
b/SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java index c65aa7062d..d3a5aa5218 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java @@ -378,6 +378,8 @@ public class SingularityConfiguration extends Configuration { private long maxSlaveUsageMetricAgeMs = 30000; + private boolean waitForNewSlaveMetricsBeforeScheduling = false; + public long getAskDriverToKillTasksAgainAfterMillis() { return askDriverToKillTasksAgainAfterMillis; } @@ -1603,4 +1605,12 @@ public long getMaxSlaveUsageMetricAgeMs() { public void setMaxSlaveUsageMetricAgeMs(long maxSlaveUsageMetricAgeMs) { this.maxSlaveUsageMetricAgeMs = maxSlaveUsageMetricAgeMs; } + + public boolean isWaitForNewSlaveMetricsBeforeScheduling() { + return waitForNewSlaveMetricsBeforeScheduling; + } + + public void setWaitForNewSlaveMetricsBeforeScheduling(boolean waitForNewSlaveMetricsBeforeScheduling) { + this.waitForNewSlaveMetricsBeforeScheduling = waitForNewSlaveMetricsBeforeScheduling; + } } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index 745fa6b15c..03280c661c 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -31,7 +31,6 @@ import com.hubspot.singularity.RequestUtilization; import com.hubspot.singularity.SingularityDeployStatistics; import com.hubspot.singularity.SingularityPendingTaskId; -import com.hubspot.singularity.SingularitySlave; import com.hubspot.singularity.SingularitySlaveUsage; import com.hubspot.singularity.SingularitySlaveUsageWithId; import 
com.hubspot.singularity.SingularityTask; @@ -200,22 +199,12 @@ public Collection checkOffers(final Collection of String slaveId = offerHolder.getSlaveId(); Optional maybeSlaveUsage = Optional.fromNullable(currentSlaveUsages.get(slaveId)); - if (maybeSlaveUsage.isPresent() && taskManager.getActiveTaskIds().stream() - .anyMatch(t -> t.getStartedAt() > maybeSlaveUsage.get().getTimestamp() - && t.getSanitizedHost().equals(offerHolder.getSanitizedHost()))) { - Optional maybeSlave = slaveManager.getSlave(slaveId); - if (maybeSlave.isPresent()) { - Optional usage = usageHelper.collectSlaveUsageSimple( - maybeSlave.get(), - System.currentTimeMillis(), - true); - if (usage.isPresent()) { - currentSlaveUsages.put(slaveId, new SingularitySlaveUsageWithId(usage.get(), slaveId)); - } else { - LOG.warn("Failed to refresh stale slave usage data for {}. Will not schedule tasks right now.", maybeSlave.get().getName()); - currentSlaveUsages.remove(slaveId); - } - } + if (configuration.isWaitForNewSlaveMetricsBeforeScheduling() + && maybeSlaveUsage.isPresent() + && taskManager.getActiveTaskIds().stream().anyMatch(t -> t.getStartedAt() > maybeSlaveUsage.get().getTimestamp() + && t.getSanitizedHost().equals(offerHolder.getSanitizedHost()))) { + // Come back to this slave after we have collected more metrics + currentSlaveUsages.remove(slaveId); } }, offerScoringExecutor))); } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java index 5b23b89503..5f90de6aa1 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java @@ -103,75 +103,7 @@ public List getSlavesToTrackUsageFor() { return slavesToTrack; } - public Optional collectSlaveUsageSimple( - SingularitySlave slave, - long now, - boolean 
useShortTimeout) { - Optional memoryMbTotal = Optional.absent(); - Optional cpusTotal = Optional.absent(); - Optional diskMbTotal = Optional.absent(); - - try { - List allTaskUsage = mesosClient.getSlaveResourceUsage(slave.getHost(), useShortTimeout); - MesosSlaveMetricsSnapshotObject slaveMetricsSnapshot = mesosClient.getSlaveMetricsSnapshot(slave.getHost()); - - double memoryMbReservedOnSlave = 0L; - double cpuReservedOnSlave = 0.0; - double diskMbReservedOnSlave = 0L; - - double systemMemTotalBytes = 0; - double systemMemFreeBytes = 0; - double systemLoad1Min = 0; - double systemLoad5Min = 0; - double systemLoad15Min = 0; - double slaveDiskUsed = 0; - double slaveDiskTotal = 0; - double systemCpusTotal = 0; - if (slaveMetricsSnapshot != null) { - memoryMbReservedOnSlave = slaveMetricsSnapshot.getSlaveMemUsed(); - cpuReservedOnSlave = slaveMetricsSnapshot.getSlaveCpusUsed(); - diskMbReservedOnSlave = slaveMetricsSnapshot.getSlaveDiskUsed(); - systemMemTotalBytes = slaveMetricsSnapshot.getSystemMemTotalBytes(); - systemMemFreeBytes = slaveMetricsSnapshot.getSystemMemFreeBytes(); - systemLoad1Min = slaveMetricsSnapshot.getSystemLoad1Min(); - systemLoad5Min = slaveMetricsSnapshot.getSystemLoad5Min(); - systemLoad15Min = slaveMetricsSnapshot.getSystemLoad15Min(); - slaveDiskUsed = slaveMetricsSnapshot.getSlaveDiskUsed(); - slaveDiskTotal = slaveMetricsSnapshot.getSlaveDiskTotal(); - systemCpusTotal = slaveMetricsSnapshot.getSystemCpusTotal(); - } - - if (!slave.getResources().isPresent() || - !slave.getResources().get().getMemoryMegaBytes().isPresent() || - !slave.getResources().get().getNumCpus().isPresent()) { - LOG.debug("Could not find slave or resources for slave {}", slave.getId()); - } else { - memoryMbTotal = Optional.of(slave.getResources().get().getMemoryMegaBytes().get().longValue()); - cpusTotal = Optional.of(slave.getResources().get().getNumCpus().get().doubleValue()); - diskMbTotal = Optional.of(slave.getResources().get().getDiskSpace().get()); - } - - 
SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage( - cpuReservedOnSlave, cpuReservedOnSlave, cpusTotal, - memoryMbReservedOnSlave, memoryMbReservedOnSlave, memoryMbTotal, - diskMbReservedOnSlave, diskMbReservedOnSlave, diskMbTotal, - allTaskUsage.size(), now, - systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, - systemLoad1Min, systemLoad5Min, systemLoad15Min, - slaveDiskUsed, slaveDiskTotal); - - LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage); - usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage); - return Optional.of(slaveUsage); - } catch (Throwable t) { - String message = String.format("Could not get slave usage for host %s", slave.getHost()); - LOG.error(message, t); - exceptionNotifier.notify(message, t); - } - return Optional.absent(); - } - - public Optional collectSlaveUsage( + public void collectSlaveUsage( SingularitySlave slave, long now, Map utilizationPerRequestId, @@ -342,13 +274,11 @@ public Optional collectSlaveUsage( LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage); usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage); - return Optional.of(slaveUsage); } catch (Throwable t) { String message = String.format("Could not get slave usage for host %s", slave.getHost()); LOG.error(message, t); exceptionNotifier.notify(message, t); } - return Optional.absent(); } private SingularityTaskUsage getUsage(MesosTaskMonitorObject taskUsage) { From b80e8c975d7c8e5424c4d5aeadd6e46cae036418 Mon Sep 17 00:00:00 2001 From: Stephen Salinas Date: Wed, 8 Aug 2018 17:09:44 -0400 Subject: [PATCH 22/26] Add leader/web cache for request utilizations --- .../singularity/data/SingularityWebCache.java | 25 ++++++++++ .../singularity/data/UsageManager.java | 48 +++++++++++++++++-- .../mesos/SingularityMesosOfferScheduler.java | 2 +- .../singularity/resources/UsageResource.java | 12 +++-- .../scheduler/SingularityCleaner.java | 6 ++- .../scheduler/SingularityLeaderCache.java |
26 ++++++++++ .../SingularityLeaderCacheCoordinator.java | 5 ++ .../scheduler/SingularityUsagePoller.java | 2 +- SingularityUI/app/actions/api/utilization.es6 | 4 +- .../requestDetail/RequestUtilization.jsx | 1 - 10 files changed, 118 insertions(+), 13 deletions(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/data/SingularityWebCache.java b/SingularityService/src/main/java/com/hubspot/singularity/data/SingularityWebCache.java index 5c4adaad48..c481a376cc 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/data/SingularityWebCache.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/data/SingularityWebCache.java @@ -10,6 +10,7 @@ import com.google.common.base.Optional; import com.google.inject.Inject; import com.google.inject.Singleton; +import com.hubspot.singularity.RequestUtilization; import com.hubspot.singularity.SingularityPendingTask; import com.hubspot.singularity.SingularityPendingTaskId; import com.hubspot.singularity.SingularityRequestGroup; @@ -37,6 +38,9 @@ public class SingularityWebCache { private volatile List cachedRequestGroups; private volatile long lastRequestGroupsCache; + private volatile Map cachedRequestUtilizations; + private volatile long lastRequestUtilizationCache; + private final long cacheForMillis; private final Meter cleanupHitMeter; @@ -54,6 +58,9 @@ public class SingularityWebCache { private final Meter requestGroupsHitMeter; private final Meter requestGroupsMissMeter; + private final Meter requestUtilizationHitMeter; + private final Meter requestUtilizationMissMeter; + @Inject public SingularityWebCache(SingularityConfiguration configuration, MetricRegistry metrics) { this.cacheForMillis = configuration.getCacheForWebForMillis(); @@ -72,6 +79,9 @@ public SingularityWebCache(SingularityConfiguration configuration, MetricRegistr this.requestGroupsHitMeter = metrics.meter("zk.web.caches.requests.hits"); this.requestGroupsMissMeter = metrics.meter("zk.web.caches.requests.miss"); 
+ + this.requestUtilizationHitMeter = metrics.meter("zk.web.caches.utilization.hits"); + this.requestUtilizationMissMeter = metrics.meter("zk.web.caches.utilization.miss"); } public boolean useCachedPendingTasks() { @@ -94,6 +104,10 @@ public boolean useCachedRequestGroups() { return useCache(lastRequestGroupsCache); } + public boolean useCachedRequestUtilizations() { + return useCache(lastRequestUtilizationCache); + } + private boolean useCache(long lastCache) { return lastCache >= 0 && (System.currentTimeMillis() - lastCache) < cacheForMillis; } @@ -128,6 +142,11 @@ public List getRequests() { return new ArrayList<>(cachedRequests.values()); } + public Map getRequestUtilizations() { + requestUtilizationHitMeter.mark(); + return new HashMap<>(cachedRequestUtilizations); + } + public Optional getRequest(String requestId) { return Optional.fromNullable(cachedRequests.get(requestId)); } @@ -179,4 +198,10 @@ public void cacheRequestGroups(List requestGroups) { lastRequestGroupsCache = System.currentTimeMillis(); } + public void cacheRequestUtilizations(Map requestUtilizations) { + requestUtilizationMissMeter.mark(); + cachedRequestUtilizations = new HashMap<>(requestUtilizations); + lastRequestUtilizationCache = System.currentTimeMillis(); + } + } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/data/UsageManager.java b/SingularityService/src/main/java/com/hubspot/singularity/data/UsageManager.java index 23d489626b..740bf09dd1 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/data/UsageManager.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/data/UsageManager.java @@ -28,6 +28,7 @@ import com.hubspot.singularity.SingularityTaskUsage; import com.hubspot.singularity.config.SingularityConfiguration; import com.hubspot.singularity.data.transcoders.Transcoder; +import com.hubspot.singularity.scheduler.SingularityLeaderCache; @Singleton public class UsageManager extends CuratorAsyncManager { @@ -47,18 +48,23 @@ 
public class UsageManager extends CuratorAsyncManager { private final Transcoder taskCurrentUsageTranscoder; private final Transcoder clusterUtilizationTranscoder; private final Transcoder requestUtilizationTranscoder; + private final SingularityWebCache webCache; + private final SingularityLeaderCache leaderCache; @Inject public UsageManager(CuratorFramework curator, SingularityConfiguration configuration, MetricRegistry metricRegistry, + SingularityWebCache webCache, + SingularityLeaderCache leaderCache, Transcoder slaveUsageTranscoder, Transcoder taskUsageTranscoder, Transcoder taskCurrentUsageTranscoder, Transcoder clusterUtilizationTranscoder, Transcoder requestUtilizationTranscoder) { super(curator, configuration, metricRegistry); - + this.webCache = webCache; + this.leaderCache = leaderCache; this.slaveUsageTranscoder = slaveUsageTranscoder; this.taskUsageTranscoder = taskUsageTranscoder; this.taskCurrentUsageTranscoder = taskCurrentUsageTranscoder; @@ -184,22 +190,58 @@ public Optional getClusterUtilization() { return getData(USAGE_SUMMARY_PATH, clusterUtilizationTranscoder); } + public void activateLeaderCache() { + leaderCache.cacheRequestUtilizations(getRequestUtilizations(false)); + } + public Map getRequestUtilizations() { - return getAsyncChildren(REQUESTS_PATH, requestUtilizationTranscoder).stream() + return getRequestUtilizations(false); + } + + public Map getRequestUtilizations(boolean useWebCache) { + if (leaderCache.active()) { + return leaderCache.getRequestUtilizations(); + } + + if (useWebCache && webCache.useCachedRequestUtilizations()) { + return webCache.getRequestUtilizations(); + } + Map requestUtilizations = getAsyncChildren(REQUESTS_PATH, requestUtilizationTranscoder).stream() .collect(Collectors.toMap( RequestUtilization::getRequestId, Function.identity() )); + if (useWebCache) { + webCache.cacheRequestUtilizations(requestUtilizations); + } + return requestUtilizations; } - public Optional getRequestUtilization(String requestId) { + 
public Optional getRequestUtilization(String requestId, boolean useWebCache) { + if (leaderCache.active()) { + return Optional.fromNullable(leaderCache.getRequestUtilizations().get(requestId)); + } + + if (useWebCache && webCache.useCachedRequestUtilizations()) { + return Optional.fromNullable(webCache.getRequestUtilizations().get(requestId)); + } return getData(getRequestPath(requestId), requestUtilizationTranscoder); } public SingularityCreateResult saveRequestUtilization(RequestUtilization requestUtilization) { + if (leaderCache.active()) { + leaderCache.putRequestUtilization(requestUtilization); + } return save(getRequestPath(requestUtilization.getRequestId()), requestUtilization, requestUtilizationTranscoder); } + public SingularityDeleteResult deleteRequestUtilization(String requestId) { + if (leaderCache.active()) { + leaderCache.removeRequestUtilization(requestId); + } + return delete(getRequestPath(requestId)); + } + public List getCurrentSlaveUsages(List slaveIds) { List paths = new ArrayList<>(slaveIds.size()); for (String slaveId : slaveIds) { diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index c1d843a5d1..180e07b68d 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -164,7 +164,7 @@ public Collection checkOffers(final Collection of } final AtomicInteger tasksScheduled = new AtomicInteger(0); - Map requestUtilizations = usageManager.getRequestUtilizations(); + Map requestUtilizations = usageManager.getRequestUtilizations(false); List activeTaskIds = taskManager.getActiveTaskIds(); final Map currentSlaveUsagesBySlaveId = usageManager.getCurrentSlaveUsages( diff --git 
a/SingularityService/src/main/java/com/hubspot/singularity/resources/UsageResource.java b/SingularityService/src/main/java/com/hubspot/singularity/resources/UsageResource.java index 9f6a330754..fd035832a7 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/resources/UsageResource.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/resources/UsageResource.java @@ -7,6 +7,7 @@ import javax.ws.rs.Path; import javax.ws.rs.PathParam; import javax.ws.rs.Produces; +import javax.ws.rs.QueryParam; import javax.ws.rs.core.MediaType; import com.google.common.base.Optional; @@ -118,15 +119,18 @@ public SingularityClusterUtilization getClusterUtilization(@Parameter(hidden = t @GET @Path("/requests") - public List getRequestUtilizations(@Auth SingularityUser user) { - return new ArrayList<>(usageManager.getRequestUtilizations().values()); + public List getRequestUtilizations(@Auth SingularityUser user, + @QueryParam("useWebCache") Boolean useWebCache) { + return new ArrayList<>(usageManager.getRequestUtilizations(useWebCache != null && useWebCache).values()); } @GET @Path("/requests/request/{requestId}") - public Optional getRequestUtilization(@Auth SingularityUser user, @PathParam("requestId") String requestId) { + public Optional getRequestUtilization(@Auth SingularityUser user, + @PathParam("requestId") String requestId, + @QueryParam("useWebCache") Boolean useWebCache) { authorizationHelper.checkForAuthorizationByRequestId(requestId, user, SingularityAuthorizationScope.READ); - return usageManager.getRequestUtilization(requestId); + return usageManager.getRequestUtilization(requestId, useWebCache != null && useWebCache); } } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityCleaner.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityCleaner.java index 70864caf7d..0b1e0b45df 100644 --- 
a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityCleaner.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityCleaner.java @@ -52,6 +52,7 @@ import com.hubspot.singularity.data.DeployManager; import com.hubspot.singularity.data.RequestManager; import com.hubspot.singularity.data.TaskManager; +import com.hubspot.singularity.data.UsageManager; import com.hubspot.singularity.data.history.RequestHistoryHelper; import com.hubspot.singularity.expiring.SingularityExpiringBounce; import com.hubspot.singularity.hooks.LoadBalancerClient; @@ -74,6 +75,7 @@ public class SingularityCleaner { private final RequestHistoryHelper requestHistoryHelper; private final SingularityMesosScheduler scheduler; private final SingularitySchedulerLock lock; + private final UsageManager usageManager; private final SingularityConfiguration configuration; private final long killNonLongRunningTasksInCleanupAfterMillis; @@ -81,7 +83,7 @@ public class SingularityCleaner { @Inject public SingularityCleaner(TaskManager taskManager, SingularityDeployHealthHelper deployHealthHelper, DeployManager deployManager, RequestManager requestManager, SingularityConfiguration configuration, LoadBalancerClient lbClient, SingularityExceptionNotifier exceptionNotifier, - RequestHistoryHelper requestHistoryHelper, SingularityMesosScheduler scheduler, SingularitySchedulerLock lock) { + RequestHistoryHelper requestHistoryHelper, SingularityMesosScheduler scheduler, SingularitySchedulerLock lock, UsageManager usageManager) { this.taskManager = taskManager; this.lbClient = lbClient; this.deployHealthHelper = deployHealthHelper; @@ -91,6 +93,7 @@ public SingularityCleaner(TaskManager taskManager, SingularityDeployHealthHelper this.requestHistoryHelper = requestHistoryHelper; this.scheduler = scheduler; this.lock = lock; + this.usageManager = usageManager; this.configuration = configuration; @@ -519,6 +522,7 @@ private void 
cleanupRequestData(SingularityRequestCleanup requestCleanup) { taskManager.deleteRequestId(requestCleanup.getRequestId()); deployManager.deleteRequestId(requestCleanup.getRequestId()); LOG.trace("Deleted stale request data for {}", requestCleanup.getRequestId()); + usageManager.deleteRequestUtilization(requestCleanup.getRequestId()); } public int drainCleanupQueue() { diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityLeaderCache.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityLeaderCache.java index 7b485e607b..9ddfaeb013 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityLeaderCache.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityLeaderCache.java @@ -19,6 +19,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import com.hubspot.singularity.ExtendedTaskState; +import com.hubspot.singularity.RequestUtilization; import com.hubspot.singularity.SingularityKilledTaskIdRecord; import com.hubspot.singularity.SingularityPendingTask; import com.hubspot.singularity.SingularityPendingTaskId; @@ -46,6 +47,7 @@ public class SingularityLeaderCache { private Map slaves; private Map racks; private Set pendingTaskIdsToDelete; + private Map requestUtilizations; private volatile boolean active; @@ -114,6 +116,10 @@ public void stop() { active = false; } + public void cacheRequestUtilizations(Map requestUtilizations) { + this.requestUtilizations = new HashMap<>(requestUtilizations); + } + public boolean active() { return active; } @@ -433,4 +439,24 @@ public void putRack(SingularityRack rack) { racks.put(rack.getId(), rack); } + + public void putRequestUtilization(RequestUtilization requestUtilization) { + if (!active) { + LOG.warn("putRequestUtilization {}, but not active", requestUtilization); + } + + requestUtilizations.put(requestUtilization.getRequestId(), requestUtilization); + } + + public void 
removeRequestUtilization(String requestId) { + if (!active) { + LOG.warn("removeRequestUtilization {}, but not active", requestId); + return; + } + requestUtilizations.remove(requestId); + } + + public Map getRequestUtilizations() { + return new HashMap<>(requestUtilizations); + } } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityLeaderCacheCoordinator.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityLeaderCacheCoordinator.java index 14199c7734..c183f54b13 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityLeaderCacheCoordinator.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityLeaderCacheCoordinator.java @@ -7,6 +7,7 @@ import com.hubspot.singularity.data.RequestManager; import com.hubspot.singularity.data.SlaveManager; import com.hubspot.singularity.data.TaskManager; +import com.hubspot.singularity.data.UsageManager; @Singleton public class SingularityLeaderCacheCoordinator { @@ -16,6 +17,7 @@ public class SingularityLeaderCacheCoordinator { private final RequestManager requestManager; private final SlaveManager slaveManager; private final RackManager rackManager; + private final UsageManager usageManager; private final SingularityLeaderCache leaderCache; @Inject @@ -24,12 +26,14 @@ public SingularityLeaderCacheCoordinator(TaskManager taskManager, RequestManager requestManager, SlaveManager slaveManager, RackManager rackManager, + UsageManager usageManager, SingularityLeaderCache leaderCache) { this.taskManager = taskManager; this.deployManager = deployManager; this.requestManager = requestManager; this.slaveManager = slaveManager; this.rackManager = rackManager; + this.usageManager = usageManager; this.leaderCache = leaderCache; } @@ -39,6 +43,7 @@ public void activateLeaderCache() { requestManager.activateLeaderCache(); slaveManager.activateLeaderCache(); rackManager.activateLeaderCache(); + 
usageManager.activateLeaderCache(); leaderCache.activate(); } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java index 011c9bb85e..23e2cca2d4 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsagePoller.java @@ -100,7 +100,7 @@ public class SingularityUsagePoller extends SingularityLeaderOnlyPoller { @Override public void runActionOnPoll() { Map utilizationPerRequestId = new ConcurrentHashMap<>(); - Map previousUtilizations = usageManager.getRequestUtilizations(); + Map previousUtilizations = usageManager.getRequestUtilizations(false); final long now = System.currentTimeMillis(); AtomicLong totalMemBytesUsed = new AtomicLong(0); diff --git a/SingularityUI/app/actions/api/utilization.es6 b/SingularityUI/app/actions/api/utilization.es6 index a5e6bf2eb3..83f1dbb505 100644 --- a/SingularityUI/app/actions/api/utilization.es6 +++ b/SingularityUI/app/actions/api/utilization.es6 @@ -11,7 +11,7 @@ export const FetchUtilization = buildApiAction( export const FetchRequestUtilizations = buildApiAction( 'FETCH_REQUEST_UTILIZATIONS', (catchStatusCodes = null) => ({ - url: '/usage/requests', + url: '/usage/requests?useWebCache=true', catchStatusCodes }) ); @@ -20,7 +20,7 @@ export const FetchRequestUtilizations = buildApiAction( export const FetchRequestUtilization = buildApiAction( 'FETCH_REQUEST_UTILIZATION', (requestId, catchStatusCodes = null) => ({ - url: `/usage/requests/request/${requestId}`, + url: `/usage/requests/request/${requestId}?useWebCache=true`, catchStatusCodes }), (requestId) => requestId diff --git a/SingularityUI/app/components/requestDetail/RequestUtilization.jsx b/SingularityUI/app/components/requestDetail/RequestUtilization.jsx index 129b4213e5..50bf2b90a7 100644 
--- a/SingularityUI/app/components/requestDetail/RequestUtilization.jsx +++ b/SingularityUI/app/components/requestDetail/RequestUtilization.jsx @@ -119,7 +119,6 @@ RequestUtilization.propTypes = { const mapStateToProps = function(state, ownProps) { - console.log(state); const requestId = ownProps.requestId; return { utilization: Utils.maybe(state, ['api', 'requestUtilization', requestId, 'data']) From 8cae538d5fe151dd7db605260c1590a46ac6113f Mon Sep 17 00:00:00 2001 From: Stephen Salinas Date: Thu, 9 Aug 2018 10:33:03 -0400 Subject: [PATCH 23/26] new strategy for new host overload check --- .../com/hubspot/mesos/client/MesosClient.java | 6 +++++- .../mesos/client/SingularityMesosClient.java | 10 ++++++--- .../config/MesosConfiguration.java | 11 ++++++++++ .../config/SingularityConfiguration.java | 10 ++++----- .../mesos/SingularityMesosOfferScheduler.java | 21 +++++++++++++------ .../scheduler/SingularityUsageHelper.java | 4 ++++ 6 files changed, 47 insertions(+), 15 deletions(-) diff --git a/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/MesosClient.java b/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/MesosClient.java index 1ba75881eb..84c0f1ede6 100644 --- a/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/MesosClient.java +++ b/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/MesosClient.java @@ -35,7 +35,11 @@ public MesosClientException(String message, Throwable cause) { public MesosMasterMetricsSnapshotObject getMasterMetricsSnapshot(String uri); - public MesosSlaveMetricsSnapshotObject getSlaveMetricsSnapshot(String uri); + default MesosSlaveMetricsSnapshotObject getSlaveMetricsSnapshot(String uri) { + return getSlaveMetricsSnapshot(uri, false); + } + + public MesosSlaveMetricsSnapshotObject getSlaveMetricsSnapshot(String uri, boolean useShortTimeout); public String getSlaveUri(String hostname); diff --git a/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/SingularityMesosClient.java
b/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/SingularityMesosClient.java index ceb28d27e2..9002182d22 100644 --- a/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/SingularityMesosClient.java +++ b/SingularityMesosClient/src/main/java/com/hubspot/mesos/client/SingularityMesosClient.java @@ -80,7 +80,11 @@ private HttpResponse getFromMesos(String uri, boolean useShortTimeout) { } private T getFromMesos(String uri, Class clazz) { - HttpResponse response = getFromMesos(uri, false); + return getFromMesos(uri, clazz, false); + } + + private T getFromMesos(String uri, Class clazz, boolean useShortTimeout) { + HttpResponse response = getFromMesos(uri, useShortTimeout); try { return response.getAs(clazz); @@ -100,8 +104,8 @@ public MesosMasterMetricsSnapshotObject getMasterMetricsSnapshot(String uri) { } @Override - public MesosSlaveMetricsSnapshotObject getSlaveMetricsSnapshot(String hostname) { - return getFromMesos(String.format(MESOS_SLAVE_METRICS_SNAPSHOT_URL, hostname), MesosSlaveMetricsSnapshotObject.class); + public MesosSlaveMetricsSnapshotObject getSlaveMetricsSnapshot(String hostname, boolean useShortTimeout) { + return getFromMesos(String.format(MESOS_SLAVE_METRICS_SNAPSHOT_URL, hostname), MesosSlaveMetricsSnapshotObject.class, useShortTimeout); } @Override diff --git a/SingularityService/src/main/java/com/hubspot/singularity/config/MesosConfiguration.java b/SingularityService/src/main/java/com/hubspot/singularity/config/MesosConfiguration.java index 95f0c063f6..ee547687e8 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/config/MesosConfiguration.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/config/MesosConfiguration.java @@ -65,6 +65,9 @@ public class MesosConfiguration { private double load5OverloadedThreshold = 1.0; private double load1OverloadedThreshold = 1.5; + private double recheckMetricsLoad1Threshold = 0.75; + private double recheckMetricsLoad5Threshold = 0.8; + public int 
getMaxNumInstancesPerRequest() { return maxNumInstancesPerRequest; } @@ -344,4 +347,12 @@ public double getDiskWeight() { public void setDiskWeight(double diskWeight) { this.diskWeight = diskWeight; } + + public double getRecheckMetricsLoad1Threshold() { + return recheckMetricsLoad1Threshold; + } + + public double getRecheckMetricsLoad5Threshold() { + return recheckMetricsLoad5Threshold; + } } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java b/SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java index d3a5aa5218..78ac2026cd 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java @@ -378,7 +378,7 @@ public class SingularityConfiguration extends Configuration { private long maxSlaveUsageMetricAgeMs = 30000; - private boolean waitForNewSlaveMetricsBeforeScheduling = false; + private boolean reCheckMetricsForLargeNewTaskCount = false; public long getAskDriverToKillTasksAgainAfterMillis() { return askDriverToKillTasksAgainAfterMillis; @@ -1606,11 +1606,11 @@ public void setMaxSlaveUsageMetricAgeMs(long maxSlaveUsageMetricAgeMs) { this.maxSlaveUsageMetricAgeMs = maxSlaveUsageMetricAgeMs; } - public boolean isWaitForNewSlaveMetricsBeforeScheduling() { - return waitForNewSlaveMetricsBeforeScheduling; + public boolean isReCheckMetricsForLargeNewTaskCount() { + return reCheckMetricsForLargeNewTaskCount; } - public void setWaitForNewSlaveMetricsBeforeScheduling(boolean waitForNewSlaveMetricsBeforeScheduling) { - this.waitForNewSlaveMetricsBeforeScheduling = waitForNewSlaveMetricsBeforeScheduling; + public void setReCheckMetricsForLargeNewTaskCount(boolean reCheckMetricsForLargeNewTaskCount) { + this.reCheckMetricsForLargeNewTaskCount = reCheckMetricsForLargeNewTaskCount; } } diff --git 
a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index cf38e8b990..ec218fbb43 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -27,6 +27,7 @@ import com.google.common.util.concurrent.ThreadFactoryBuilder; import com.google.inject.Inject; import com.hubspot.mesos.Resources; +import com.hubspot.mesos.json.MesosSlaveMetricsSnapshotObject; import com.hubspot.singularity.RequestType; import com.hubspot.singularity.RequestUtilization; import com.hubspot.singularity.SingularityDeployStatistics; @@ -199,12 +200,20 @@ public Collection checkOffers(final Collection of String slaveId = offerHolder.getSlaveId(); Optional maybeSlaveUsage = Optional.fromNullable(currentSlaveUsages.get(slaveId)); - if (configuration.isWaitForNewSlaveMetricsBeforeScheduling() - && maybeSlaveUsage.isPresent() - && taskManager.getActiveTaskIds().stream().anyMatch(t -> t.getStartedAt() > maybeSlaveUsage.get().getTimestamp() - && t.getSanitizedHost().equals(offerHolder.getSanitizedHost()))) { - // Come back to this slave after we have collected more metrics - currentSlaveUsages.remove(slaveId); + if (configuration.isReCheckMetricsForLargeNewTaskCount() && maybeSlaveUsage.isPresent()) { + long newTaskCount = taskManager.getActiveTaskIds().stream() + .filter((t) -> t.getStartedAt() > maybeSlaveUsage.get().getTimestamp() && t.getSanitizedHost().equals(offerHolder.getSanitizedHost())) + .count(); + if (newTaskCount >= maybeSlaveUsage.get().getNumTasks() / 2) { + MesosSlaveMetricsSnapshotObject metricsSnapshot = usageHelper.getMetricsSnapshot(offerHolder.getHostname()); + + if (metricsSnapshot.getSystemLoad5Min() / metricsSnapshot.getSystemCpusTotal() > 
mesosConfiguration.getRecheckMetricsLoad1Threshold() + || metricsSnapshot.getSystemLoad1Min() / metricsSnapshot.getSystemCpusTotal() > mesosConfiguration.getRecheckMetricsLoad5Threshold()) { + // Come back to this slave after we have collected more metrics + currentSlaveUsages.remove(slaveId); + } + } + } }, offerScoringExecutor))); } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java index 5f90de6aa1..667607f8ce 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityUsageHelper.java @@ -103,6 +103,10 @@ public List getSlavesToTrackUsageFor() { return slavesToTrack; } + public MesosSlaveMetricsSnapshotObject getMetricsSnapshot(String host) { + return mesosClient.getSlaveMetricsSnapshot(host, true); + } + public void collectSlaveUsage( SingularitySlave slave, long now, From 4ea2166aad66f3cbd18dcd5943ef6903433cc5ad Mon Sep 17 00:00:00 2001 From: Stephen Salinas Date: Thu, 9 Aug 2018 10:36:20 -0400 Subject: [PATCH 24/26] fix the test client as well --- .../com/hubspot/singularity/scheduler/TestingMesosClient.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SingularityService/src/test/java/com/hubspot/singularity/scheduler/TestingMesosClient.java b/SingularityService/src/test/java/com/hubspot/singularity/scheduler/TestingMesosClient.java index 56f17f924c..3f2272952d 100644 --- a/SingularityService/src/test/java/com/hubspot/singularity/scheduler/TestingMesosClient.java +++ b/SingularityService/src/test/java/com/hubspot/singularity/scheduler/TestingMesosClient.java @@ -51,7 +51,7 @@ public MesosMasterMetricsSnapshotObject getMasterMetricsSnapshot(String uri) { } @Override - public MesosSlaveMetricsSnapshotObject getSlaveMetricsSnapshot(String hostname) { + public 
MesosSlaveMetricsSnapshotObject getSlaveMetricsSnapshot(String hostname, boolean useShortTimeout) { return slaveMetrics.get(hostname); } From 5b822c3c15e4d750fc8dbc974f8e10730fa815b5 Mon Sep 17 00:00:00 2001 From: Stephen Salinas Date: Thu, 9 Aug 2018 12:50:01 -0400 Subject: [PATCH 25/26] Add logging --- .../singularity/mesos/SingularityMesosOfferScheduler.java | 1 + 1 file changed, 1 insertion(+) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index ec218fbb43..31d3529a60 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -210,6 +210,7 @@ public Collection checkOffers(final Collection of if (metricsSnapshot.getSystemLoad5Min() / metricsSnapshot.getSystemCpusTotal() > mesosConfiguration.getRecheckMetricsLoad1Threshold() || metricsSnapshot.getSystemLoad1Min() / metricsSnapshot.getSystemCpusTotal() > mesosConfiguration.getRecheckMetricsLoad5Threshold()) { // Come back to this slave after we have collected more metrics + LOG.info("Skipping evaluation of {} until new metrics are collected. 
Current load is load1: {}, load5: {}", offerHolder.getHostname(), metricsSnapshot.getSystemLoad1Min(), metricsSnapshot.getSystemLoad5Min()); currentSlaveUsages.remove(slaveId); } } From e2e67943acbf1c488f592dbc4e9ca0373c464ca4 Mon Sep 17 00:00:00 2001 From: Stephen Salinas Date: Thu, 9 Aug 2018 13:35:41 -0400 Subject: [PATCH 26/26] add try/catch here --- .../mesos/SingularityMesosOfferScheduler.java | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java index 31d3529a60..a7a7c3e166 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/mesos/SingularityMesosOfferScheduler.java @@ -205,15 +205,21 @@ public Collection checkOffers(final Collection of .filter((t) -> t.getStartedAt() > maybeSlaveUsage.get().getTimestamp() && t.getSanitizedHost().equals(offerHolder.getSanitizedHost())) .count(); if (newTaskCount >= maybeSlaveUsage.get().getNumTasks() / 2) { - MesosSlaveMetricsSnapshotObject metricsSnapshot = usageHelper.getMetricsSnapshot(offerHolder.getHostname()); - - if (metricsSnapshot.getSystemLoad5Min() / metricsSnapshot.getSystemCpusTotal() > mesosConfiguration.getRecheckMetricsLoad1Threshold() - || metricsSnapshot.getSystemLoad1Min() / metricsSnapshot.getSystemCpusTotal() > mesosConfiguration.getRecheckMetricsLoad5Threshold()) { - // Come back to this slave after we have collected more metrics - LOG.info("Skipping evaluation of {} until new metrics are collected. 
Current load is load1: {}, load5: {}", offerHolder.getHostname(), metricsSnapshot.getSystemLoad1Min(), metricsSnapshot.getSystemLoad5Min()); + try { + MesosSlaveMetricsSnapshotObject metricsSnapshot = usageHelper.getMetricsSnapshot(offerHolder.getHostname()); + + if (metricsSnapshot.getSystemLoad1Min() / metricsSnapshot.getSystemCpusTotal() > mesosConfiguration.getRecheckMetricsLoad1Threshold() + || metricsSnapshot.getSystemLoad5Min() / metricsSnapshot.getSystemCpusTotal() > mesosConfiguration.getRecheckMetricsLoad5Threshold()) { + // Come back to this slave after we have collected more metrics + LOG.info("Skipping evaluation of {} until new metrics are collected. Current load is load1: {}, load5: {}", offerHolder.getHostname(), metricsSnapshot.getSystemLoad1Min(), metricsSnapshot + .getSystemLoad5Min()); + currentSlaveUsages.remove(slaveId); + } + } catch (Throwable t) { + LOG.warn("Could not check metrics for host {}, skipping", offerHolder.getHostname(), t); currentSlaveUsages.remove(slaveId); + } } - } } }, offerScoringExecutor)));