Skip to content

Commit

Permalink
Merge pull request #1822 from HubSpot/new-host-overloading
Browse files Browse the repository at this point in the history
Prevent new host overloading
  • Loading branch information
ssalinas authored Aug 16, 2018
2 parents 9003962 + e2e6794 commit 29c7199
Show file tree
Hide file tree
Showing 21 changed files with 788 additions and 486 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ public class SingularitySlaveUsage {
private final double cpusUsed;
private final double cpusReserved;
private final Optional<Double> cpusTotal;
private final long memoryBytesUsed;
private final long memoryMbReserved;
private final double memoryBytesUsed;
private final double memoryMbReserved;
private final Optional<Long> memoryMbTotal;
private final long diskBytesUsed;
private final long diskMbReserved;
private final double diskBytesUsed;
private final double diskMbReserved;
private final Optional<Long> diskMbTotal;
private final int numTasks;
private final long timestamp;
Expand All @@ -37,11 +37,11 @@ public class SingularitySlaveUsage {
public SingularitySlaveUsage(@JsonProperty("cpusUsed") double cpusUsed,
@JsonProperty("cpusReserved") double cpusReserved,
@JsonProperty("cpusTotal") Optional<Double> cpusTotal,
@JsonProperty("memoryBytesUsed") long memoryBytesUsed,
@JsonProperty("memoryMbReserved") long memoryMbReserved,
@JsonProperty("memoryBytesUsed") double memoryBytesUsed,
@JsonProperty("memoryMbReserved") double memoryMbReserved,
@JsonProperty("memoryMbTotal") Optional<Long> memoryMbTotal,
@JsonProperty("diskBytesUsed") long diskBytesUsed,
@JsonProperty("diskMbReserved") long diskMbReserved,
@JsonProperty("diskBytesUsed") double diskBytesUsed,
@JsonProperty("diskMbReserved") double diskMbReserved,
@JsonProperty("diskMbTotal") Optional<Long> diskMbTotal,
@JsonProperty("numTasks") int numTasks,
@JsonProperty("timestamp") long timestamp,
Expand Down Expand Up @@ -94,12 +94,12 @@ public Optional<Double> getCpusTotal() {
}

@Schema(description = "Total memory used by tasks in bytes")
public long getMemoryBytesUsed() {
public double getMemoryBytesUsed() {
return memoryBytesUsed;
}

@Schema(description = "Total memory reserved by tasks in MB")
public long getMemoryMbReserved() {
public double getMemoryMbReserved() {
return memoryMbReserved;
}

Expand All @@ -122,12 +122,12 @@ public Optional<Long> getMemoryBytesTotal() {
}

@Schema(description = "Total disk currently used by tasks in bytes")
public long getDiskBytesUsed() {
public double getDiskBytesUsed() {
return diskBytesUsed;
}

@Schema(description = "Total disk currently reserved by tasks in MB")
public long getDiskMbReserved() {
public double getDiskMbReserved() {
return diskMbReserved;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,20 @@ public MesosClientException(String message, Throwable cause) {

public MesosMasterMetricsSnapshotObject getMasterMetricsSnapshot(String uri);

public MesosSlaveMetricsSnapshotObject getSlaveMetricsSnapshot(String uri);
default MesosSlaveMetricsSnapshotObject getSlaveMetricsSnapshot(String uri) {
return getSlaveMetricsSnapshot(uri, false);
}

public MesosSlaveMetricsSnapshotObject getSlaveMetricsSnapshot(String uri, boolean useShortTimeout);

public String getSlaveUri(String hostname);

public MesosSlaveStateObject getSlaveState(String uri);

public List<MesosTaskMonitorObject> getSlaveResourceUsage(String hostname);
default List<MesosTaskMonitorObject> getSlaveResourceUsage(String hostname) {
return getSlaveResourceUsage(hostname, false);
}

public List<MesosTaskMonitorObject> getSlaveResourceUsage(String hostname, boolean useShortTimeout);

}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import com.google.inject.name.Named;
import com.hubspot.horizon.HttpClient;
import com.hubspot.horizon.HttpRequest;
import com.hubspot.horizon.HttpRequest.Options;
import com.hubspot.horizon.HttpResponse;
import com.hubspot.mesos.JavaUtils;
import com.hubspot.mesos.json.MesosMasterMetricsSnapshotObject;
Expand All @@ -22,7 +23,8 @@
@Singleton
public class SingularityMesosClient implements MesosClient {

public static final String HTTP_CLIENT_NAME = "mesos.http.client";
public static final String DEFAULT_HTTP_CLIENT_NAME = "mesos.http.client";
public static final String SHORT_TIMEOUT_HTTP_CLIENT_NAME = "mesos.http.client.short.timeout";

private static final Logger LOG = LoggerFactory.getLogger(SingularityMesosClient.class);

Expand All @@ -35,10 +37,13 @@ public class SingularityMesosClient implements MesosClient {
private static final TypeReference<List<MesosTaskMonitorObject>> TASK_MONITOR_TYPE_REFERENCE = new TypeReference<List<MesosTaskMonitorObject>>() {};

private final HttpClient httpClient;
private final HttpClient shortTimeoutHttpClient;

@Inject
public SingularityMesosClient(@Named(HTTP_CLIENT_NAME) HttpClient httpClient) {
public SingularityMesosClient(@Named(DEFAULT_HTTP_CLIENT_NAME) HttpClient httpClient,
@Named(SHORT_TIMEOUT_HTTP_CLIENT_NAME) HttpClient shortTimeoutHttpClient) {
this.httpClient = httpClient;
this.shortTimeoutHttpClient = shortTimeoutHttpClient;
}

@Override
Expand All @@ -51,15 +56,16 @@ public String getMasterMetricsSnapshotUri(String hostnameAndPort) {
return String.format(MESOS_MASTER_METRICS_SNAPSHOT_URL, hostnameAndPort);
}

private HttpResponse getFromMesos(String uri) {
private HttpResponse getFromMesos(String uri, boolean useShortTimeout) {
HttpClient currentHttpClient = useShortTimeout ? shortTimeoutHttpClient : httpClient;
HttpResponse response = null;

final long start = System.currentTimeMillis();

LOG.debug("Fetching {} from mesos", uri);

try {
response = httpClient.execute(HttpRequest.newBuilder().setUrl(uri).build());
response = currentHttpClient.execute(HttpRequest.newBuilder().setUrl(uri).build(), new Options());

LOG.debug("Response {} - {} after {}", response.getStatusCode(), uri, JavaUtils.duration(start));
} catch (Exception e) {
Expand All @@ -74,7 +80,11 @@ private HttpResponse getFromMesos(String uri) {
}

private <T> T getFromMesos(String uri, Class<T> clazz) {
HttpResponse response = getFromMesos(uri);
return getFromMesos(uri, clazz, false);
}

private <T> T getFromMesos(String uri, Class<T> clazz, boolean useShortTimeout) {
HttpResponse response = getFromMesos(uri, useShortTimeout);

try {
return response.getAs(clazz);
Expand All @@ -94,8 +104,8 @@ public MesosMasterMetricsSnapshotObject getMasterMetricsSnapshot(String uri) {
}

@Override
public MesosSlaveMetricsSnapshotObject getSlaveMetricsSnapshot(String hostname) {
return getFromMesos(String.format(MESOS_SLAVE_METRICS_SNAPSHOT_URL, hostname), MesosSlaveMetricsSnapshotObject.class);
public MesosSlaveMetricsSnapshotObject getSlaveMetricsSnapshot(String hostname, boolean useShortTimeout) {
return getFromMesos(String.format(MESOS_SLAVE_METRICS_SNAPSHOT_URL, hostname), MesosSlaveMetricsSnapshotObject.class, useShortTimeout);
}

@Override
Expand All @@ -109,10 +119,10 @@ public MesosSlaveStateObject getSlaveState(String uri) {
}

@Override
public List<MesosTaskMonitorObject> getSlaveResourceUsage(String hostname) {
public List<MesosTaskMonitorObject> getSlaveResourceUsage(String hostname, boolean useShortTimeout) {
final String uri = String.format(MESOS_SLAVE_STATISTICS_URL, hostname);

HttpResponse response = getFromMesos(uri);
HttpResponse response = getFromMesos(uri, useShortTimeout);

try {
return response.getAs(TASK_MONITOR_TYPE_REFERENCE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,26 @@
import com.google.inject.name.Names;
import com.hubspot.horizon.HttpClient;
import com.hubspot.horizon.HttpConfig;
import com.hubspot.horizon.HttpConfig.Builder;
import com.hubspot.horizon.ning.NingHttpClient;
import com.hubspot.mesos.JavaUtils;

public class SingularityMesosClientModule extends AbstractModule {

public static final String MESOS_CLIENT_OBJECT_MAPPER = "singularity.mesos.client.object.mapper";
private static final int MESOS_CLIENT_HTTP_SHORT_TIMEOUT_SECONDS = 5;

@Override
protected void configure() {
ObjectMapper objectMapper = JavaUtils.newObjectMapper();
HttpConfig httpConfig = HttpConfig.newBuilder().setObjectMapper(objectMapper).build();
HttpClient httpClient = new NingHttpClient(httpConfig);
Builder httpConfigBuilder = HttpConfig.newBuilder().setObjectMapper(objectMapper);

bind(ObjectMapper.class).annotatedWith(Names.named(MESOS_CLIENT_OBJECT_MAPPER)).toInstance(objectMapper);
bind(HttpClient.class).annotatedWith(Names.named(SingularityMesosClient.HTTP_CLIENT_NAME)).toInstance(httpClient);
bind(HttpClient.class).annotatedWith(Names.named(SingularityMesosClient.DEFAULT_HTTP_CLIENT_NAME))
.toInstance(new NingHttpClient(httpConfigBuilder.build()));

bind(HttpClient.class).annotatedWith(Names.named(SingularityMesosClient.SHORT_TIMEOUT_HTTP_CLIENT_NAME))
.toInstance(new NingHttpClient(httpConfigBuilder.setRequestTimeoutSeconds(MESOS_CLIENT_HTTP_SHORT_TIMEOUT_SECONDS).build()));

bind(MesosClient.class).to(SingularityMesosClient.class).in(Scopes.SINGLETON);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ public class MesosConfiguration {
private double load5OverloadedThreshold = 1.0;
private double load1OverloadedThreshold = 1.5;

private double recheckMetricsLoad1Threshold = 0.75;
private double recheckMetricsLoad5Threshold = 0.8;

public int getMaxNumInstancesPerRequest() {
return maxNumInstancesPerRequest;
}
Expand Down Expand Up @@ -344,4 +347,12 @@ public double getDiskWeight() {
public void setDiskWeight(double diskWeight) {
this.diskWeight = diskWeight;
}

public double getRecheckMetricsLoad1Threshold() {
return recheckMetricsLoad1Threshold;
}

public double getRecheckMetricsLoad5Threshold() {
return recheckMetricsLoad5Threshold;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,10 @@ public class SingularityConfiguration extends Configuration {

private long preemptibleTaskMaxExpectedRuntimeMs = 900000; // 15 minutes

private long maxSlaveUsageMetricAgeMs = 30000;

private boolean reCheckMetricsForLargeNewTaskCount = false;

public long getAskDriverToKillTasksAgainAfterMillis() {
return askDriverToKillTasksAgainAfterMillis;
}
Expand Down Expand Up @@ -1593,4 +1597,20 @@ public long getPreemptibleTaskMaxExpectedRuntimeMs() {
public void setPreemptibleTaskMaxExpectedRuntimeMs(long preemptibleTaskMaxExpectedRuntimeMs) {
this.preemptibleTaskMaxExpectedRuntimeMs = preemptibleTaskMaxExpectedRuntimeMs;
}

public long getMaxSlaveUsageMetricAgeMs() {
return maxSlaveUsageMetricAgeMs;
}

public void setMaxSlaveUsageMetricAgeMs(long maxSlaveUsageMetricAgeMs) {
this.maxSlaveUsageMetricAgeMs = maxSlaveUsageMetricAgeMs;
}

public boolean isReCheckMetricsForLargeNewTaskCount() {
return reCheckMetricsForLargeNewTaskCount;
}

public void setReCheckMetricsForLargeNewTaskCount(boolean reCheckMetricsForLargeNewTaskCount) {
this.reCheckMetricsForLargeNewTaskCount = reCheckMetricsForLargeNewTaskCount;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import com.google.common.base.Optional;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.hubspot.singularity.RequestUtilization;
import com.hubspot.singularity.SingularityPendingTask;
import com.hubspot.singularity.SingularityPendingTaskId;
import com.hubspot.singularity.SingularityRequestGroup;
Expand Down Expand Up @@ -37,6 +38,9 @@ public class SingularityWebCache {
private volatile List<SingularityRequestGroup> cachedRequestGroups;
private volatile long lastRequestGroupsCache;

private volatile Map<String, RequestUtilization> cachedRequestUtilizations;
private volatile long lastRequestUtilizationCache;

private final long cacheForMillis;

private final Meter cleanupHitMeter;
Expand All @@ -54,6 +58,9 @@ public class SingularityWebCache {
private final Meter requestGroupsHitMeter;
private final Meter requestGroupsMissMeter;

private final Meter requestUtilizationHitMeter;
private final Meter requestUtilizationMissMeter;

@Inject
public SingularityWebCache(SingularityConfiguration configuration, MetricRegistry metrics) {
this.cacheForMillis = configuration.getCacheForWebForMillis();
Expand All @@ -72,6 +79,9 @@ public SingularityWebCache(SingularityConfiguration configuration, MetricRegistr

this.requestGroupsHitMeter = metrics.meter("zk.web.caches.requests.hits");
this.requestGroupsMissMeter = metrics.meter("zk.web.caches.requests.miss");

this.requestUtilizationHitMeter = metrics.meter("zk.web.caches.utilization.hits");
this.requestUtilizationMissMeter = metrics.meter("zk.web.caches.utilization.miss");
}

public boolean useCachedPendingTasks() {
Expand All @@ -94,6 +104,10 @@ public boolean useCachedRequestGroups() {
return useCache(lastRequestGroupsCache);
}

public boolean useCachedRequestUtilizations() {
return useCache(lastRequestUtilizationCache);
}

private boolean useCache(long lastCache) {
return lastCache >= 0 && (System.currentTimeMillis() - lastCache) < cacheForMillis;
}
Expand Down Expand Up @@ -128,6 +142,11 @@ public List<SingularityRequestWithState> getRequests() {
return new ArrayList<>(cachedRequests.values());
}

public Map<String, RequestUtilization> getRequestUtilizations() {
requestUtilizationHitMeter.mark();
return new HashMap<>(cachedRequestUtilizations);
}

public Optional<SingularityRequestWithState> getRequest(String requestId) {
return Optional.fromNullable(cachedRequests.get(requestId));
}
Expand Down Expand Up @@ -179,4 +198,10 @@ public void cacheRequestGroups(List<SingularityRequestGroup> requestGroups) {
lastRequestGroupsCache = System.currentTimeMillis();
}

public void cacheRequestUtilizations(Map<String, RequestUtilization> requestUtilizations) {
requestUtilizationMissMeter.mark();
cachedRequestUtilizations = new HashMap<>(requestUtilizations);
lastRequestUtilizationCache = System.currentTimeMillis();
}

}
Loading

0 comments on commit 29c7199

Please sign in to comment.