Skip to content

Commit

Permalink
deploy: add deployer controller metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
mfojtik committed Jun 21, 2017
1 parent 6ce74a3 commit 27f7b40
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 14 deletions.
48 changes: 34 additions & 14 deletions pkg/deploy/controller/deployer/deployer_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,31 @@ type fatalError string

// Error implements the error interface. The stored message is prefixed so
// that it is recognizable as a fatal rollout error.
func (e fatalError) Error() string {
	const prefix = "fatal error handling rollout: "
	return prefix + string(e)
}

// NewFatalError returns message as a fatalError. As a side effect it calls
// RecordDeployerPodError with FatalDeployerErrorType, so every call is counted
// whether or not the returned error is ultimately used.
func NewFatalError(message string) fatalError {
	fatal := fatalError(message)
	RecordDeployerPodError(FatalDeployerErrorType)
	return fatal
}

// actionableError carries the message of an error on which users can act.
type actionableError string

// Error implements the error interface by returning the message verbatim.
func (e actionableError) Error() string {
	msg := string(e)
	return msg
}

// NewActionableError returns message as an actionableError. As a side effect
// it calls RecordDeployerPodError with ActionableDeployerErrorType, so every
// call is counted whether or not the returned error is ultimately used.
func NewActionableError(message string) actionableError {
	actionable := actionableError(message)
	RecordDeployerPodError(ActionableDeployerErrorType)
	return actionable
}

// retryableError carries the message of an error on which the controller will
// retry.
type retryableError string

// Error implements the error interface by returning the message verbatim.
func (e retryableError) Error() string {
	msg := string(e)
	return msg
}

// NewRetryableError returns message as a retryableError. As a side effect it
// calls RecordDeployerPodError with RetryableDeployerErrorType, so every call
// is counted whether or not the returned error is ultimately used.
func NewRetryableError(message string) retryableError {
	retryable := retryableError(message)
	RecordDeployerPodError(RetryableDeployerErrorType)
	return retryable
}

// DeploymentController starts a deployment by creating a deployer pod which
// implements a deployment strategy. The status of the deployment will follow
// the status of the deployer pod. The deployer pod is correlated to the
Expand Down Expand Up @@ -123,7 +143,7 @@ func (c *DeploymentController) handle(deployment *v1.ReplicationController, will
if deployutil.IsDeploymentCancelled(deployment) {
nextStatus = deployapi.DeploymentStatusPending
if err := c.cleanupDeployerPods(deployment); err != nil {
return err
return NewRetryableError(err.Error())
}
break
}
Expand All @@ -137,13 +157,13 @@ func (c *DeploymentController) handle(deployment *v1.ReplicationController, will
// Generate a deployer pod spec.
deployerPod, err := c.makeDeployerPod(deployment)
if err != nil {
return fatalError(fmt.Sprintf("couldn't make deployer pod for %q: %v", deployutil.LabelForDeploymentV1(deployment), err))
return NewFatalError(fmt.Sprintf("couldn't make deployer pod for %q: %v", deployutil.LabelForDeploymentV1(deployment), err))
}
// Create the deployer pod.
deploymentPod, err := c.pn.Pods(deployment.Namespace).Create(deployerPod)
// Retry on error.
if err != nil {
return actionableError(fmt.Sprintf("couldn't create deployer pod for %q: %v", deployutil.LabelForDeploymentV1(deployment), err))
return NewActionableError(fmt.Sprintf("couldn't create deployer pod for %q: %v", deployutil.LabelForDeploymentV1(deployment), err))
}
updatedAnnotations[deployapi.DeploymentPodAnnotation] = deploymentPod.Name
nextStatus = deployapi.DeploymentStatusPending
Expand All @@ -155,7 +175,7 @@ func (c *DeploymentController) handle(deployment *v1.ReplicationController, will
// succeeded but the deployment state update failed and now we're re-
// entering. Ensure that the pod is the one we created by verifying the
// annotation on it, and throw a retryable error.
return fmt.Errorf("couldn't fetch existing deployer pod for %s: %v", deployutil.LabelForDeploymentV1(deployment), deployerErr)
return NewRetryableError(fmt.Sprintf("couldn't fetch existing deployer pod for %s: %v", deployutil.LabelForDeploymentV1(deployment), deployerErr))

default: /* deployerErr == nil */
// Do a stronger check to validate that the existing deployer pod is
Expand Down Expand Up @@ -187,18 +207,18 @@ func (c *DeploymentController) handle(deployment *v1.ReplicationController, will
// Retry more before setting the deployment to Failed if it's Pending - the pod might not have
// appeared in the cache yet.
if !willBeDropped && currentStatus == deployapi.DeploymentStatusPending {
return deployerErr
return NewRetryableError(deployerErr.Error())
}
updatedAnnotations[deployapi.DeploymentStatusReasonAnnotation] = deployapi.DeploymentFailedDeployerPodNoLongerExists
c.emitDeploymentEvent(deployment, v1.EventTypeWarning, "Failed", fmt.Sprintf("Deployer pod %q has gone missing", deployerPodName))
deployerErr = fmt.Errorf("Failing rollout for %q because its deployer pod %q disappeared", deployutil.LabelForDeploymentV1(deployment), deployerPodName)
deployerErr = NewFatalError(fmt.Sprintf("Failing rollout for %q because its deployer pod %q disappeared", deployutil.LabelForDeploymentV1(deployment), deployerPodName))
utilruntime.HandleError(deployerErr)
}

// Most likely dead code since we never get an error different from 404 back from the cache.
case deployerErr != nil:
// We'll try again later on resync. Continue to process cancellations.
deployerErr = fmt.Errorf("Error getting deployer pod %q for %q: %v", deployerPodName, deployutil.LabelForDeploymentV1(deployment), deployerErr)
deployerErr = NewRetryableError(fmt.Sprintf("Error getting deployer pod %q for %q: %v", deployerPodName, deployutil.LabelForDeploymentV1(deployment), deployerErr))
utilruntime.HandleError(deployerErr)

default: /* err == nil */
Expand All @@ -208,13 +228,13 @@ func (c *DeploymentController) handle(deployment *v1.ReplicationController, will
// Failed.
if deployutil.IsDeploymentCancelled(deployment) {
if err := c.cleanupDeployerPods(deployment); err != nil {
return err
return NewRetryableError(err.Error())
}
} else {
// Set an ownerRef for the deployment lifecycle pods so they are cleaned up when the
// replication controller is deleted.
if err := c.setDeployerPodsOwnerRef(deployment); err != nil {
return err
return NewRetryableError(err.Error())
}
}
}
Expand All @@ -224,19 +244,19 @@ func (c *DeploymentController) handle(deployment *v1.ReplicationController, will
// were created just after we issued the first cleanup request.
if deployutil.IsDeploymentCancelled(deployment) {
if err := c.cleanupDeployerPods(deployment); err != nil {
return err
return NewRetryableError(err.Error())
}
} else {
// Set an ownerRef for the deployment lifecycle pods so they are cleaned up when the
// replication controller is deleted.
if err := c.setDeployerPodsOwnerRef(deployment); err != nil {
return err
return NewRetryableError(err.Error())
}
}

case deployapi.DeploymentStatusComplete:
if err := c.cleanupDeployerPods(deployment); err != nil {
return err
return NewRetryableError(err.Error())
}
}

Expand All @@ -260,7 +280,7 @@ func (c *DeploymentController) handle(deployment *v1.ReplicationController, will
}

if _, err := c.rn.ReplicationControllers(deployment.Namespace).Update(deployment); err != nil {
return fmt.Errorf("couldn't update rollout status for %q to %s: %v", deployutil.LabelForDeploymentV1(deployment), nextStatus, err)
return NewRetryableError(fmt.Sprintf("couldn't update rollout status for %q to %s: %v", deployutil.LabelForDeploymentV1(deployment), nextStatus, err))
}
glog.V(4).Infof("Updated rollout status for %q from %s to %s (scale: %d)", deployutil.LabelForDeploymentV1(deployment), currentStatus, nextStatus, deployment.Spec.Replicas)

Expand Down Expand Up @@ -503,7 +523,7 @@ func (c *DeploymentController) cleanupDeployerPods(deployment *v1.ReplicationCon
}

if !cleanedAll {
return actionableError(fmt.Sprintf("couldn't clean up all deployer pods for %q", deployutil.LabelForDeploymentV1(deployment)))
return NewActionableError(fmt.Sprintf("couldn't clean up all deployer pods for %q", deployutil.LabelForDeploymentV1(deployment)))
}
return nil
}
Expand Down
34 changes: 34 additions & 0 deletions pkg/deploy/controller/deployer/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package deployment

import "github.com/prometheus/client_golang/prometheus"

// DeployerControllerSubsystem is how this controller is represented in
// prometheus metrics.
const DeployerControllerSubsystem = "deployer_controller"

// deployerPodErrorsCounter is a counter vector for deployer pod errors. It is
// partitioned by a single "type" label and reported under
// DeployerControllerSubsystem.
var deployerPodErrorsCounter = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Subsystem: DeployerControllerSubsystem,
		Name:      "failure_count",
		Help:      "Counter that counts total number of deployer pod errors per error type",
	},
	[]string{"type"},
)

// DeployerErrorType classifies a deployer pod error. RecordDeployerPodError
// converts it to a string and uses that as a metric label value.
type DeployerErrorType string

// Known deployer error types. They are constants rather than variables so the
// label values cannot be reassigned at runtime by other packages.
//
// Note that the value of RetryableDeployerErrorType is spelled "retriable";
// it is kept as-is so that the emitted label value does not change.
const (
	ActionableDeployerErrorType DeployerErrorType = "actionable"
	FatalDeployerErrorType      DeployerErrorType = "fatal"
	RetryableDeployerErrorType  DeployerErrorType = "retriable"
)

// RecordDeployerPodError increments deployerPodErrorsCounter by one for the
// given error type, using the string form of errType as the label value.
func RecordDeployerPodError(errType DeployerErrorType) {
	label := string(errType)
	counter := deployerPodErrorsCounter.WithLabelValues(label)
	counter.Inc()
}

// init registers deployerPodErrorsCounter with prometheus when the package is
// loaded.
func init() {
	// NOTE(review): MustRegister presumably panics if registration fails
	// (e.g. a duplicate collector) — confirm against the vendored client.
	prometheus.MustRegister(deployerPodErrorsCounter)
}

0 comments on commit 27f7b40

Please sign in to comment.