Fix broken upgrade (#85)
* Adding CheckRackForceUpgrade()
* gofmt
* Ran codegen
* Adding an int test
* Update crd in helm
* Trying to deal with updating the release script
* Fix codegen empty lines after merge

jimdickinson authored May 21, 2020
1 parent 1845cab commit 86b869d
Showing 9 changed files with 11,851 additions and 17 deletions.
@@ -54,6 +54,13 @@ spec:
       configBuilderImage:
         description: Container image for the config builder init container.
         type: string
+      forceUpgradeRacks:
+        description: Rack names in this list are set to the latest StatefulSet
+          configuration even if Cassandra nodes are down. Use this to recover
+          from an upgrade that couldn't roll out.
+        items:
+          type: string
+        type: array
       managementApiAuth:
         description: Config for the Management API certificates
         properties:
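For orientation: a user triggers this recovery path by writing rack names into spec.forceUpgradeRacks, and the operator empties the list once the forced rollout is applied (see reconcile_racks.go below). Here is a minimal sketch of setting the field through the Go API types and a controller-runtime client; the helper name and wiring are illustrative, not code from this commit:

    package example

    import (
        "context"

        api "github.com/datastax/cass-operator/operator/pkg/apis/cassandra/v1beta1"
        "sigs.k8s.io/controller-runtime/pkg/client"
    )

    // forceUpgradeRacks adds rack names to spec.forceUpgradeRacks so the operator
    // pushes those racks to the latest StatefulSet config even while Cassandra
    // nodes are down; the operator clears the list when it is done.
    func forceUpgradeRacks(ctx context.Context, c client.Client, dc *api.CassandraDatacenter, racks ...string) error {
        patch := client.MergeFrom(dc.DeepCopy())
        dc.Spec.ForceUpgradeRacks = append(dc.Spec.ForceUpgradeRacks, racks...)
        return c.Patch(ctx, dc, patch)
    }

The integration test at the end of this commit exercises the same field with a kubectl merge patch instead.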
5,828 changes: 5,822 additions & 6 deletions docs/user/cass-operator-manifests-pre-1.15.yaml

Large diffs are not rendered by default.

5,828 changes: 5,822 additions & 6 deletions docs/user/cass-operator-manifests.yaml

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion hack/release/make-yaml-bundle.sh
@@ -11,7 +11,9 @@ echo '---' >> "$bundle"
 cat operator/deploy/namespace.yaml | yq r - >> "$bundle"
 
 echo '---' >> "$bundle"
-helm template ./charts/cass-operator-chart/ -n cass-operator | kubectl create --dry-run=client -o yaml -n cass-operator -f - >> "$bundle"
+helm template ./charts/cass-operator-chart/ -n cass-operator | kubectl create --validate=false --dry-run=client -o yaml -n cass-operator -f - >> "$bundle"
 
+# k8s before 1.15 doesn't understand x-kubernetes-list-map-keys, which is an array of strings
+
 grep -v "x-kubernetes-preserve-unknown-fields\|matchPolicy" < "$bundle" > docs/user/cass-operator-manifests-pre-1.15.yaml
 mv "$bundle" docs/user/cass-operator-manifests.yaml
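The same filtering the grep -v line performs, spelled out as a small Go program for readers who don't parse grep patterns quickly; this is an illustration, not part of the release tooling:

    package main

    import (
        "bufio"
        "fmt"
        "os"
        "strings"
    )

    // Reads a manifest bundle on stdin and drops the schema keys that
    // pre-1.15 apiservers reject, line by line, like the script's grep -v.
    func main() {
        sc := bufio.NewScanner(os.Stdin)
        sc.Buffer(make([]byte, 1024*1024), 1024*1024) // manifest bundles have long lines
        for sc.Scan() {
            line := sc.Text()
            if strings.Contains(line, "x-kubernetes-preserve-unknown-fields") ||
                strings.Contains(line, "matchPolicy") {
                continue
            }
            fmt.Println(line)
        }
    }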
@@ -54,6 +54,13 @@ spec:
       configBuilderImage:
         description: Container image for the config builder init container.
         type: string
+      forceUpgradeRacks:
+        description: Rack names in this list are set to the latest StatefulSet
+          configuration even if Cassandra nodes are down. Use this to recover
+          from an upgrade that couldn't roll out.
+        items:
+          type: string
+        type: array
       managementApiAuth:
         description: Config for the Management API certificates
         properties:
13 changes: 9 additions & 4 deletions operator/pkg/apis/cassandra/v1beta1/cassandradatacenter_types.go
@@ -150,6 +150,11 @@ type CassandraDatacenterSpec struct {
     // More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector
     NodeSelector map[string]string `json:"nodeSelector,omitempty"`
 
+    // Rack names in this list are set to the latest StatefulSet configuration
+    // even if Cassandra nodes are down. Use this to recover from an upgrade that couldn't
+    // roll out.
+    ForceUpgradeRacks []string `json:"forceUpgradeRacks,omitempty"`
+
     // PodTemplate provides customisation options (labels, annotations, affinity rules, resource requests, and so on) for the cassandra pods
     PodTemplateSpec *corev1.PodTemplateSpec `json:"podTemplateSpec,omitempty"`
 }
@@ -200,14 +205,14 @@ const (
 )
 
 type DatacenterCondition struct {
-    Type DatacenterConditionType `json:"type"`
-    Status corev1.ConditionStatus `json:"status"`
-    LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
+    Type               DatacenterConditionType `json:"type"`
+    Status             corev1.ConditionStatus  `json:"status"`
+    LastTransitionTime metav1.Time             `json:"lastTransitionTime,omitempty"`
 }
 
 func NewDatacenterCondition(conditionType DatacenterConditionType, status corev1.ConditionStatus) *DatacenterCondition {
     return &DatacenterCondition{
-        Type: conditionType,
+        Type:   conditionType,
         Status: status,
     }
 }
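Note that NewDatacenterCondition leaves LastTransitionTime zeroed; whoever records the condition stamps the time. A sketch of that upsert pattern, assuming the datacenter status carries a Conditions slice of DatacenterCondition (the helper below is illustrative, not the operator's actual setCondition):

    package example

    import (
        corev1 "k8s.io/api/core/v1"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

        api "github.com/datastax/cass-operator/operator/pkg/apis/cassandra/v1beta1"
    )

    // recordCondition upserts a condition on the datacenter status, stamping
    // LastTransitionTime only when the status value actually changes.
    // Assumes dc.Status.Conditions is []api.DatacenterCondition.
    func recordCondition(dc *api.CassandraDatacenter, condType api.DatacenterConditionType, status corev1.ConditionStatus) {
        cond := api.NewDatacenterCondition(condType, status)
        cond.LastTransitionTime = metav1.Now()
        for i := range dc.Status.Conditions {
            if dc.Status.Conditions[i].Type == cond.Type {
                if dc.Status.Conditions[i].Status != cond.Status {
                    dc.Status.Conditions[i] = *cond
                }
                return
            }
        }
        dc.Status.Conditions = append(dc.Status.Conditions, *cond)
    }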
5 changes: 5 additions & 0 deletions operator/pkg/apis/cassandra/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default.
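Since the generated file is collapsed: for a plain []string field like ForceUpgradeRacks, deepcopy-gen emits an allocate-and-copy branch. The standalone helper below shows the semantics it guarantees (a sketch, not the verbatim generated code):

    package example

    // deepCopyRacks mirrors what deepcopy-gen emits for a []string field such as
    // ForceUpgradeRacks: allocate a fresh slice and copy the elements, so the
    // copy shares no backing array with the original.
    func deepCopyRacks(in []string) []string {
        if in == nil {
            return nil
        }
        out := make([]string, len(in))
        copy(out, in)
        return out
    }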

80 changes: 80 additions & 0 deletions operator/pkg/reconciliation/reconcile_racks.go
@@ -17,6 +17,7 @@ import (
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/labels"
     "k8s.io/apimachinery/pkg/types"
+    "k8s.io/kubernetes/pkg/util/slice"
     "sigs.k8s.io/controller-runtime/pkg/client"
     "sigs.k8s.io/controller-runtime/pkg/reconcile"
@@ -273,6 +274,81 @@ func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult {
     return result.Continue()
 }
 
+func (rc *ReconciliationContext) CheckRackForceUpgrade() result.ReconcileResult {
+    // This code is *very* similar to CheckRackPodTemplate(), but it's not an exact
+    // copy. Some 3 to 5 line parts could maybe be extracted into functions.
+    logger := rc.ReqLogger
+    dc := rc.Datacenter
+    logger.Info("starting CheckRackForceUpgrade()")
+
+    forceRacks := dc.Spec.ForceUpgradeRacks
+    if len(forceRacks) == 0 {
+        return result.Continue()
+    }
+
+    for idx := range rc.desiredRackInformation {
+        rackName := rc.desiredRackInformation[idx].RackName
+        if slice.ContainsString(forceRacks, rackName, nil) {
+
+            statefulSet := rc.statefulSets[idx]
+
+            // have to use zero here, because each statefulset is created with no replicas
+            // in GetStatefulSetForRack()
+            desiredSts, err := newStatefulSetForCassandraDatacenter(rackName, dc, 0)
+            if err != nil {
+                logger.Error(err, "error calling newStatefulSetForCassandraDatacenter")
+                return result.Error(err)
+            }
+
+            // "fix" the replica count, and maintain labels and annotations the k8s admin may have set
+            desiredSts.Spec.Replicas = statefulSet.Spec.Replicas
+            desiredSts.Labels = utils.MergeMap(map[string]string{}, statefulSet.Labels, desiredSts.Labels)
+            desiredSts.Annotations = utils.MergeMap(map[string]string{}, statefulSet.Annotations, desiredSts.Annotations)
+
+            desiredSts.DeepCopyInto(statefulSet)
+
+            rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeNormal, events.UpdatingRack,
+                "Force updating rack %s", rackName)
+
+            dcPatch := client.MergeFrom(dc.DeepCopy())
+            rc.setCondition(api.NewDatacenterCondition(api.DatacenterUpdating, corev1.ConditionTrue))
+
+            if err := rc.Client.Status().Patch(rc.Ctx, dc, dcPatch); err != nil {
+                logger.Error(err, "error patching datacenter status for updating condition")
+                return result.Error(err)
+            }
+
+            if err := setOperatorProgressStatus(rc, api.ProgressUpdating); err != nil {
+                return result.Error(err)
+            }
+
+            logger.Info("Force updating statefulset pod specs",
+                "statefulSet", statefulSet,
+            )
+
+            if err := rc.Client.Update(rc.Ctx, statefulSet); err != nil {
+                logger.Error(
+                    err,
+                    "Unable to perform update on statefulset for force update config",
+                    "statefulSet", statefulSet)
+                return result.Error(err)
+            }
+
+        }
+    }
+
+    dcPatch := client.MergeFrom(dc.DeepCopy())
+    dc.Spec.ForceUpgradeRacks = nil
+
+    if err := rc.Client.Patch(rc.Ctx, dc, dcPatch); err != nil {
+        logger.Error(err, "error patching datacenter to clear force upgrade")
+        return result.Error(err)
+    }
+
+    logger.Info("done CheckRackForceUpgrade()")
+    return result.Done()
+}
+
 func (rc *ReconciliationContext) CheckRackLabels() result.ReconcileResult {
     rc.ReqLogger.Info("reconcile_racks::CheckRackLabels")
 
@@ -1857,6 +1933,10 @@ func (rc *ReconciliationContext) ReconcileAllRacks() (reconcile.Result, error) {
         return recResult.Output()
     }
 
+    if recResult := rc.CheckRackForceUpgrade(); recResult.Completed() {
+        return recResult.Output()
+    }
+
     if recResult := rc.CheckRackScale(); recResult.Completed() {
         return recResult.Output()
     }
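CheckRackForceUpgrade matches racks with slice.ContainsString from k8s.io/kubernetes/pkg/util/slice. Its third parameter is an optional modifier applied to each slice element before comparison; passing nil, as above, means plain string equality. A standalone illustration:

    package main

    import (
        "fmt"
        "strings"

        "k8s.io/kubernetes/pkg/util/slice"
    )

    func main() {
        forceRacks := []string{"R1", "R3"}

        fmt.Println(slice.ContainsString(forceRacks, "R1", nil))             // true: exact match
        fmt.Println(slice.ContainsString(forceRacks, "r3", nil))             // false: comparison is case-sensitive
        fmt.Println(slice.ContainsString(forceRacks, "r3", strings.ToLower)) // true: modifier lowercases each element first
    }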
@@ -0,0 +1,96 @@
+// Copyright DataStax, Inc.
+// Please see the included license file for details.
+
+package test_bad_config_and_fix
+
+import (
+    "fmt"
+    "testing"
+    "time"
+
+    . "github.com/onsi/ginkgo"
+    . "github.com/onsi/gomega"
+
+    corev1 "k8s.io/api/core/v1"
+
+    ginkgo_util "github.com/datastax/cass-operator/mage/ginkgo"
+    "github.com/datastax/cass-operator/mage/kubectl"
+)
+
+var (
+    testName   = "test rolling out a bad config and fixing it"
+    namespace  = "test-bad-config-and-fix"
+    dcName     = "dc1"
+    dcYaml     = "../testdata/oss-three-rack-three-node-dc.yaml"
+    dcResource = fmt.Sprintf("CassandraDatacenter/%s", dcName)
+    dcLabel    = fmt.Sprintf("cassandra.datastax.com/datacenter=%s", dcName)
+    ns         = ginkgo_util.NewWrapper(testName, namespace)
+)
+
+func TestLifecycle(t *testing.T) {
+    AfterSuite(func() {
+        logPath := fmt.Sprintf("%s/aftersuite", ns.LogDir)
+        kubectl.DumpAllLogs(logPath).ExecV()
+        fmt.Printf("\n\tPost-run logs dumped at: %s\n\n", logPath)
+        ns.Terminate()
+    })
+
+    RegisterFailHandler(Fail)
+    RunSpecs(t, testName)
+}
+
+var _ = Describe(testName, func() {
+    Context("when in a new cluster", func() {
+        Specify("the operator can recover from a bad image rollout by force upgrading racks", func() {
+            By("creating a namespace")
+            err := kubectl.CreateNamespace(namespace).ExecV()
+            Expect(err).ToNot(HaveOccurred())
+
+            step := "setting up cass-operator resources via helm chart"
+            ns.HelmInstall("../../charts/cass-operator-chart")
+
+            ns.WaitForOperatorReady()
+
+            step = "creating a datacenter resource with 3 racks/3 nodes"
+            k := kubectl.ApplyFiles(dcYaml)
+            ns.ExecAndLog(step, k)
+
+            ns.WaitForDatacenterReady(dcName)
+            ns.WaitForDatacenterCondition(dcName, "Ready", string(corev1.ConditionTrue))
+            ns.WaitForDatacenterCondition(dcName, "Initialized", string(corev1.ConditionTrue))
+
+            step = "apply a bad image"
+            json := "{\"spec\": {\"serverImage\": \"datastax/cassandra-v314159\"}}"
+            k = kubectl.PatchMerge(dcResource, json)
+            ns.ExecAndLog(step, k)
+
+            ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30)
+
+            time.Sleep(time.Minute * 6)
+            ns.WaitForDatacenterReadyPodCount(dcName, 2)
+
+            step = "apply a good image"
+            json = "{\"spec\": {\"serverImage\": \"\"}}"
+            k = kubectl.PatchMerge(dcResource, json)
+            ns.ExecAndLog(step, k)
+
+            step = "set the forceUpgradeRacks config"
+            json = "{\"spec\": {\"forceUpgradeRacks\": [\"r1\"]}}"
+            k = kubectl.PatchMerge(dcResource, json)
+            ns.ExecAndLog(step, k)
+
+            ns.WaitForDatacenterReady(dcName)
+
+            step = "deleting the dc"
+            k = kubectl.DeleteFromFiles(dcYaml)
+            ns.ExecAndLog(step, k)
+
+            step = "checking that the dc no longer exists"
+            json = "jsonpath={.items}"
+            k = kubectl.Get("CassandraDatacenter").
+                WithLabel(dcLabel).
+                FormatOutput(json)
+            ns.WaitForOutputAndLog(step, k, "[]", 300)
+        })
+    })
+})
