From 689d3966ad1cf1af7ae37b55c2cb3374bf075fca Mon Sep 17 00:00:00 2001
From: Alex Dadgar <alex.dadgar@gmail.com>
Date: Wed, 23 May 2018 16:44:21 -0700
Subject: [PATCH] Clean up leaked deployments on restoration

This PR cancels deployments that are active but do not have a job
associated with them. This is a broken invariant that causes issues in
the deployment watcher since it will not track them. Thus they are
objects that can't be operated on or cleaned up.

Fixes https://github.com/hashicorp/nomad/issues/4286
---
 nomad/fsm.go      | 59 +++++++++++++++++++++++++++++++++++++++++++++++
 nomad/fsm_test.go | 18 +++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/nomad/fsm.go b/nomad/fsm.go
index faa4e14bf5bd..d507ebf9b808 100644
--- a/nomad/fsm.go
+++ b/nomad/fsm.go
@@ -1261,6 +1261,12 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error {
 		}
 	}
 
+	// COMPAT Remove in 0.10
+	// Clean up active deployments that do not have a job
+	if err := n.failLeakedDeployments(newState); err != nil {
+		return err
+	}
+
 	// External code might be calling State(), so we need to synchronize
 	// here to make sure we swap in the new state store atomically.
 	n.stateLock.Lock()
@@ -1276,6 +1282,59 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error {
 	return nil
 }
 
+// failLeakedDeployments is used to fail deployments that do not have a job.
+// This state is a broken invariant that should not occur since 0.8.X.
+func (n *nomadFSM) failLeakedDeployments(state *state.StateStore) error {
+	// Scan for deployments that are referencing a job that no longer exists.
+	// This could happen if multiple deployments were created for a given job
+	// and thus the older deployment leaks and then the job is removed.
+	iter, err := state.Deployments(nil)
+	if err != nil {
+		return fmt.Errorf("failed to query deployments: %v", err)
+	}
+
+	dindex, err := state.Index("deployment")
+	if err != nil {
+		return fmt.Errorf("couldn't fetch index of deployments table: %v", err)
+	}
+
+	for {
+		raw := iter.Next()
+		if raw == nil {
+			break
+		}
+
+		d := raw.(*structs.Deployment)
+
+		// We are only looking for active deployments where the job no longer
+		// exists
+		if !d.Active() {
+			continue
+		}
+
+		// Find the job
+		job, err := state.JobByID(nil, d.Namespace, d.JobID)
+		if err != nil {
+			return fmt.Errorf("failed to lookup job %s from deployment %q: %v", d.JobID, d.ID, err)
+		}
+
+		// Job exists.
+		if job != nil {
+			continue
+		}
+
+		// Update the deployment to be terminal
+		failed := d.Copy()
+		failed.Status = structs.DeploymentStatusCancelled
+		failed.StatusDescription = structs.DeploymentStatusDescriptionStoppedJob
+		if err := state.UpsertDeployment(dindex, failed); err != nil {
+			return fmt.Errorf("failed to mark leaked deployment %q as failed: %v", failed.ID, err)
+		}
+	}
+
+	return nil
+}
+
 // reconcileQueuedAllocations re-calculates the queued allocations for every job that we
 // created a Job Summary during the snap shot restore
 func (n *nomadFSM) reconcileQueuedAllocations(index uint64) error {
diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go
index 14f984b2b60c..0f8156de0daa 100644
--- a/nomad/fsm_test.go
+++ b/nomad/fsm_test.go
@@ -2740,6 +2740,24 @@ func TestFSM_ReconcileSummaries(t *testing.T) {
 	}
 }
 
+func TestFSM_LeakedDeployments(t *testing.T) {
+	t.Parallel()
+	require := require.New(t)
+
+	// Add some state
+	fsm := testFSM(t)
+	state := fsm.State()
+	d := mock.Deployment()
+	require.NoError(state.UpsertDeployment(1000, d))
+
+	// Verify the contents
+	fsm2 := testSnapshotRestore(t, fsm)
+	state2 := fsm2.State()
+	out, _ := state2.DeploymentByID(nil, d.ID)
+	require.NotNil(out)
+	require.Equal(structs.DeploymentStatusCancelled, out.Status)
+}
+
 func TestFSM_Autopilot(t *testing.T) {
 	t.Parallel()
 	fsm := testFSM(t)