Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add re-submission of tasks during spot interruption disconnects #516

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions src/main/java/hudson/plugins/ec2/EC2ComputerLauncher.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,29 @@
*/
package hudson.plugins.ec2;

import com.amazonaws.services.ec2.model.SpotInstanceRequest;
import hudson.model.Action;
import hudson.model.Actionable;
import hudson.model.Executor;
import hudson.model.Queue;
import hudson.model.Result;
import hudson.model.Slave;
import hudson.model.TaskListener;
import hudson.slaves.ComputerLauncher;
import hudson.slaves.OfflineCause;
import hudson.slaves.SlaveComputer;
import hudson.model.queue.SubTask;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import com.amazonaws.AmazonClientException;
import jenkins.model.CauseOfInterruption;

import javax.annotation.Nonnull;

/**
* {@link ComputerLauncher} for EC2 that wraps the real user-specified {@link ComputerLauncher}.
Expand Down Expand Up @@ -75,4 +89,95 @@ public void launch(SlaveComputer slaveComputer, TaskListener listener) {
protected abstract void launchScript(EC2Computer computer, TaskListener listener)
throws AmazonClientException, IOException, InterruptedException;


/**
* This method is called after a node disconnects. See {@link ComputerLauncher#afterDisconnect(SlaveComputer, TaskListener)}
* This method is overriden to perform a check to see if the node that is disconnected is a spot instance and
* whether the disconnection is a spot interruption event. If it is a spot interruption event, the tasks that the
* node was processing will be resubmitted if a user selects the option to do so.
* @param computer
* @param listener
*/
@Override
public void afterDisconnect(SlaveComputer computer, TaskListener listener) {
if (computer == null) return; // potential edge case where computer is null

Slave node = computer.getNode();
if (node instanceof EC2SpotSlave) {

// checking if its an unexpected disconnection
final boolean isUnexpectedDisconnection = computer.isOffline() && computer.getOfflineCause()
instanceof OfflineCause.ChannelTermination;
Comment on lines +109 to +110

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some of the customers complained that OfflineCause.ChannelTermination was not always triggered for spot interruption. You may be able to dig this further here: jenkinsci/ec2-fleet-plugin#121

boolean shouldRestart = ((EC2SpotSlave) node).getRestartSpotInterruption();
if (isUnexpectedDisconnection && shouldRestart) {
SpotInstanceRequest spotRequest = ((EC2SpotSlave) node).getSpotRequest();
if (spotRequest == null) {
LOGGER.log(Level.WARNING, String.format("Could not get spot request for spot instance node %s",
node.getNodeName()));
return;
}
String code = spotRequest.getStatus().getCode();
// list of status codes - https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-bid-status.html#spot-instance-bid-status-understand
if (code.equals("instance-stopped-by-price") || code.equals("instance-stopped-no-capacity") ||
code.equals("instance-terminated-by-price") || code.equals("instance-terminated-no-capacity")) {
LOGGER.log(Level.INFO, String.format("Node %s was terminated due to spot interruption. Retriggering " +
"job", node.getNodeName()));
List<Executor> executors = computer.getExecutors();
for (Executor executor : executors) {
Queue.Executable currentExecutable = executor.getCurrentExecutable();
if (currentExecutable !=null) { // interrupting all executables
executor.interrupt(Result.ABORTED, new EC2SpotInterruptedCause(node.getNodeName()));
SubTask subTask = currentExecutable.getParent();
Queue.Task task = subTask.getOwnerTask();
// Get actions (if any)
List<Action> actions = new ArrayList<>();
if (currentExecutable instanceof Actionable) {
actions = ((Actionable) currentExecutable).getActions(Action.class);
}
LOGGER.log(Level.INFO, String.format("Spot instance for node %s was terminated. " +
"Resubmitting task %s with actions %s", node.getNodeName(), task, actions));
Queue.getInstance().schedule2(task, 10, actions);
}
}
}
}
}
}

/**
* This {@link CauseOfInterruption} is used when a Node is disconnected due to a Spot Interruption event
*/
static class EC2SpotInterruptedCause extends CauseOfInterruption {

@Nonnull
private final String nodeName;

public EC2SpotInterruptedCause(@Nonnull String nodeName) {
this.nodeName = nodeName;
}

@Override
public String getShortDescription() {
return "EC2 spot instance for node " + nodeName + " was terminated";
}

@Override
public int hashCode() {
return nodeName.hashCode();
}

@Override
public String toString() {
return getShortDescription();
}

@Override
public boolean equals(Object obj) {
if (obj instanceof EC2SpotInterruptedCause) {
return nodeName.equals(((EC2SpotInterruptedCause) obj).nodeName);
} else {
return false;
}
}
}
}
20 changes: 18 additions & 2 deletions src/main/java/hudson/plugins/ec2/EC2SpotSlave.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ public class EC2SpotSlave extends EC2AbstractSlave implements EC2Readiness {
private static final Logger LOGGER = Logger.getLogger(EC2SpotSlave.class.getName());

private final String spotInstanceRequestId;
private boolean restartSpotInterruption;

@Deprecated
public EC2SpotSlave(String name, String spotInstanceRequestId, String templateDescription, String remoteFS, int numExecutors, Mode mode, String initScript, String tmpDir, String labelString, String remoteAdmin, String jvmopts, String idleTerminationMinutes, List<EC2Tag> tags, String cloudName, int launchTimeout, AMITypeData amiType)
Expand All @@ -43,18 +44,25 @@ public EC2SpotSlave(String name, String spotInstanceRequestId, String templateDe
@Deprecated
public EC2SpotSlave(String name, String spotInstanceRequestId, String templateDescription, String remoteFS, int numExecutors, Mode mode, String initScript, String tmpDir, String labelString, String remoteAdmin, String jvmopts, String idleTerminationMinutes, List<EC2Tag> tags, String cloudName, boolean usePrivateDnsName, int launchTimeout, AMITypeData amiType)
throws FormException, IOException {
this(templateDescription + " (" + name + ")", spotInstanceRequestId, templateDescription, remoteFS, numExecutors, mode, initScript, tmpDir, labelString, Collections.emptyList(), remoteAdmin, jvmopts, idleTerminationMinutes, tags, cloudName, launchTimeout, amiType, ConnectionStrategy.backwardsCompatible(usePrivateDnsName, false, false), -1);
this(templateDescription + " (" + name + ")", spotInstanceRequestId, templateDescription, remoteFS, numExecutors, mode, initScript, tmpDir, labelString, Collections.emptyList(), remoteAdmin, jvmopts, idleTerminationMinutes, tags, cloudName, launchTimeout, amiType, ConnectionStrategy.backwardsCompatible(usePrivateDnsName, false, false), -1, false);
}

@DataBoundConstructor
@Deprecated
public EC2SpotSlave(String name, String spotInstanceRequestId, String templateDescription, String remoteFS, int numExecutors, Mode mode, String initScript, String tmpDir, String labelString, List<? extends NodeProperty<?>> nodeProperties, String remoteAdmin, String jvmopts, String idleTerminationMinutes, List<EC2Tag> tags, String cloudName, int launchTimeout, AMITypeData amiType, ConnectionStrategy connectionStrategy, int maxTotalUses)
throws FormException, IOException {
this(name, spotInstanceRequestId, templateDescription, remoteFS, numExecutors, mode, initScript, tmpDir, labelString, nodeProperties, remoteAdmin, jvmopts, idleTerminationMinutes, tags, cloudName, launchTimeout, amiType, connectionStrategy, maxTotalUses, false);
}

@DataBoundConstructor
public EC2SpotSlave(String name, String spotInstanceRequestId, String templateDescription, String remoteFS, int numExecutors, Mode mode, String initScript, String tmpDir, String labelString, List<? extends NodeProperty<?>> nodeProperties, String remoteAdmin, String jvmopts, String idleTerminationMinutes, List<EC2Tag> tags, String cloudName, int launchTimeout, AMITypeData amiType, ConnectionStrategy connectionStrategy, int maxTotalUses, boolean restartSpotInterruption)
throws FormException, IOException {

super(name, "", templateDescription, remoteFS, numExecutors, mode, labelString, amiType.isWindows() ? new EC2WindowsLauncher() :
new EC2UnixLauncher(), new EC2RetentionStrategy(idleTerminationMinutes), initScript, tmpDir, nodeProperties, remoteAdmin, jvmopts, false, idleTerminationMinutes, tags, cloudName, false, launchTimeout, amiType, connectionStrategy, maxTotalUses);

this.name = name;
this.spotInstanceRequestId = spotInstanceRequestId;
this.restartSpotInterruption = restartSpotInterruption;
}

@Override
Expand Down Expand Up @@ -190,6 +198,14 @@ public void onConnected() {
pushLiveInstancedata();
}

/**
* Gets whether the node has the setting configured to restart all its tasks when a spot interruption event occurs
* @return true if the node's tasks should be restarted
*/
public boolean getRestartSpotInterruption() {
return restartSpotInterruption;
}

@Extension
public static final class DescriptorImpl extends EC2AbstractSlave.DescriptorImpl {

Expand Down
1 change: 1 addition & 0 deletions src/main/java/hudson/plugins/ec2/SlaveTemplate.java
Original file line number Diff line number Diff line change
Expand Up @@ -1276,6 +1276,7 @@ protected EC2SpotSlave newSpotSlave(SpotInstanceRequest sir) throws FormExceptio
.withAmiType(amiType)
.withConnectionStrategy(connectionStrategy)
.withMaxTotalUses(maxTotalUses)
.withRestartSpotInterruption(spotConfig.getRestartSpotInterruption())
.build();
return EC2AgentFactory.getInstance().createSpotAgent(config);
}
Expand Down
12 changes: 12 additions & 0 deletions src/main/java/hudson/plugins/ec2/SpotConfiguration.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import java.util.Collection;
import java.util.Date;
import java.util.Objects;
import javax.annotation.Nonnull;
import javax.servlet.ServletException;
import org.kohsuke.stapler.DataBoundConstructor;
import org.kohsuke.stapler.DataBoundSetter;
Expand All @@ -31,6 +32,7 @@ public final class SpotConfiguration extends AbstractDescribableImpl<SpotConfigu
private String spotMaxBidPrice;
private boolean fallbackToOndemand;
private int spotBlockReservationDuration;
private boolean restartSpotInterruption;

@Deprecated
public SpotConfiguration(boolean useBidPrice, String spotMaxBidPrice, boolean fallbackToOndemand, String spotBlockReservationDurationStr) {
Expand Down Expand Up @@ -122,8 +124,18 @@ public static String normalizeBid(String bid) {

}

public boolean getRestartSpotInterruption() {
return restartSpotInterruption;
}

@DataBoundSetter
public void setRestartSpotInterruption(boolean restartSpotInterruption) {
this.restartSpotInterruption = restartSpotInterruption;
}

@Extension
public static class DescriptorImpl extends Descriptor<SpotConfiguration> {
@Nonnull
@Override
public String getDisplayName() {
return "spotConfig";
Expand Down
8 changes: 8 additions & 0 deletions src/main/java/hudson/plugins/ec2/util/EC2AgentConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,12 @@ private OnDemand(OnDemandBuilder builder) {
public static class Spot extends EC2AgentConfig {

final String spotInstanceRequestId;
boolean restartSpotInterruption;

private Spot(SpotBuilder builder) {
super(builder);
this.spotInstanceRequestId = builder.spotInstanceRequestId;
this.restartSpotInterruption = builder.restartSpotInterruption;
}
}

Expand Down Expand Up @@ -265,12 +267,18 @@ public OnDemand build() {
public static class SpotBuilder extends Builder<SpotBuilder, Spot> {

private String spotInstanceRequestId;
private boolean restartSpotInterruption;

public SpotBuilder withSpotInstanceRequestId(String spotInstanceRequestId) {
this.spotInstanceRequestId = spotInstanceRequestId;
return this;
}

public SpotBuilder withRestartSpotInterruption(boolean restartSpotInterruption) {
this.restartSpotInterruption = restartSpotInterruption;
return this;
}

@Override
protected SpotBuilder self() {
return this;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@ public EC2OndemandSlave createOnDemandAgent(EC2AgentConfig.OnDemand config)

@Override
public EC2SpotSlave createSpotAgent(EC2AgentConfig.Spot config) throws Descriptor.FormException, IOException {
return new EC2SpotSlave(config.name, config.spotInstanceRequestId, config.description, config.remoteFS, config.numExecutors, config.mode, config.initScript, config.tmpDir, config.labelString, config.nodeProperties, config.remoteAdmin, config.jvmopts, config.idleTerminationMinutes, config.tags, config.cloudName, config.launchTimeout, config.amiType, config.connectionStrategy, config.maxTotalUses);
return new EC2SpotSlave(config.name, config.spotInstanceRequestId, config.description, config.remoteFS, config.numExecutors, config.mode, config.initScript, config.tmpDir, config.labelString, config.nodeProperties, config.remoteAdmin, config.jvmopts, config.idleTerminationMinutes, config.tags, config.cloudName, config.launchTimeout, config.amiType, config.connectionStrategy, config.maxTotalUses, config.restartSpotInterruption);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,8 @@ THE SOFTWARE.
<f:entry title="${%Spot Block Reservation Duration}" field="spotBlockReservationDuration">
<f:number />
</f:entry>

<f:entry title="Restart tasks if spot interruted" field="restartSpotInterruption">
<f:checkbox />
</f:entry>
</j:jelly>
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<div>
If set to true, the tasks that a node was running will be restarted if a spot interruption event has happened.
Find out more information on Spot Interruption <a href="https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-interruptions.html">here</a>
</div>
52 changes: 44 additions & 8 deletions src/test/java/hudson/plugins/ec2/SlaveTemplateTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import com.amazonaws.services.ec2.model.InstanceState;
import com.amazonaws.services.ec2.model.InstanceType;
import com.amazonaws.services.ec2.model.KeyPair;
import com.amazonaws.services.ec2.model.RequestSpotInstancesRequest;
import com.amazonaws.services.ec2.model.RunInstancesRequest;
import com.amazonaws.services.ec2.model.RunInstancesResult;
import com.amazonaws.services.ec2.model.SecurityGroup;
Expand All @@ -60,6 +61,7 @@

import org.mockito.ArgumentCaptor;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
Expand Down Expand Up @@ -281,6 +283,39 @@ public void testSpotConfigWithFallback() throws Exception {
r.assertEqualBeans(orig, received, "ami,zone,spotConfig,description,remoteFS,type,jvmopts,stopOnTerminate,securityGroups,subnetId,tags,connectionStrategy,hostKeyVerificationStrategy");
}

/**
* Test to make sure the slave created has been configured properly with the restart spot interruption configuration
* correctly set
* @throws Exception - Exception that can be thrown by the Jenkins test harness
*/
@Test
public void testRestartSpotInterruption() throws Exception {
String ami = "ami1";
String description = "foo ami";

EC2Tag tag1 = new EC2Tag("name1", "value1");
EC2Tag tag2 = new EC2Tag("name2", "value2");
List<EC2Tag> tags = new ArrayList<EC2Tag>();
tags.add(tag1);
tags.add(tag2);

SpotConfiguration spotConfig = new SpotConfiguration(true);
spotConfig.setSpotMaxBidPrice("0.1");
spotConfig.setFallbackToOndemand(true);
spotConfig.setSpotBlockReservationDuration(0);
spotConfig.setRestartSpotInterruption(true);

SlaveTemplate slaveTemplate = new SlaveTemplate(ami, EC2AbstractSlave.TEST_ZONE, spotConfig, "default", "foo", InstanceType.M1Large, false, "ttt", Node.Mode.NORMAL, "foo ami", "bar", "bbb", "aaa", "10", "fff", null, "-Xmx1g", false, "subnet 456", tags, null, true, null, "", false, false, "", false, "");
List<SlaveTemplate> templates = new ArrayList<>();
templates.add(slaveTemplate);

AmazonEC2Cloud ac = new AmazonEC2Cloud("us-east-1", false, "abc", "us-east-1", "ghi", "3", templates, null, null);
r.jenkins.clouds.add(ac);

SlaveTemplate received = ((EC2Cloud) r.jenkins.clouds.iterator().next()).getTemplate(description);
r.assertEqualBeans(slaveTemplate, received, "ami,zone,spotConfig,description,remoteFS,type,jvmopts,stopOnTerminate,securityGroups,subnetId,tags,connectionStrategy,hostKeyVerificationStrategy");
}

/**
* Test to make sure the IAM Role is set properly.
*
Expand Down Expand Up @@ -480,8 +515,8 @@ public void testMinimumNumberOfInstancesActiveRangeConfig() throws Exception {
Assert.assertEquals(true, stored.getMinimumNoInstancesActiveTimeRangeDays().get("tuesday"));
}

@Test
public void provisionOndemandSetsAwsNetworkingOnEc2Request() throws Exception {
@Test
public void provisionOndemandSetsAwsNetworkingOnEc2Request() throws Exception {
boolean associatePublicIp = false;
String ami = "ami1";
String description = "foo ami";
Expand Down Expand Up @@ -521,10 +556,10 @@ public void provisionOndemandSetsAwsNetworkingOnEc2Request() throws Exception {
assertEquals(actualRequest.getSecurityGroups(), Stream.of(securityGroups).collect(Collectors.toList()));
}
}
}
}

@Test
public void provisionOndemandSetsAwsNetworkingOnNetworkInterface() throws Exception {
@Test
public void provisionOndemandSetsAwsNetworkingOnNetworkInterface() throws Exception {
boolean associatePublicIp = true;
String ami = "ami1";
String description = "foo ami";
Expand Down Expand Up @@ -561,9 +596,9 @@ public void provisionOndemandSetsAwsNetworkingOnNetworkInterface() throws Except
assertEquals(actualRequest.getSecurityGroupIds(), Collections.emptyList());
assertEquals(actualRequest.getSecurityGroups(), Collections.emptyList());
}
}
}

private AmazonEC2 setupTestForProvisioning(SlaveTemplate template) throws Exception {
private AmazonEC2 setupTestForProvisioning(SlaveTemplate template) throws Exception {
AmazonEC2Cloud mockedCloud = mock(AmazonEC2Cloud.class);
AmazonEC2 mockedEC2 = mock(AmazonEC2.class);
EC2PrivateKey mockedPrivateKey = mock(EC2PrivateKey.class);
Expand Down Expand Up @@ -614,5 +649,6 @@ private AmazonEC2 setupTestForProvisioning(SlaveTemplate template) throws Except
when(mockedEC2.runInstances(any(RunInstancesRequest.class))).thenReturn(mockedResult);

return mockedEC2;
}
}

}
Loading