Skip to content

Commit

Permalink
Split upgrade tests into tests with chaos mesh and without (#1812)
Browse files Browse the repository at this point in the history
* Split upgrade tests into tests with chaos mesh and without

* Correct linting

* Correct order of factory creation

* Move flaky HA upgrade test into dedicated test suite until it is stable
  • Loading branch information
johscheuer authored Sep 22, 2023
1 parent 83bb8cf commit a408433
Show file tree
Hide file tree
Showing 8 changed files with 708 additions and 441 deletions.
19 changes: 18 additions & 1 deletion e2e/fixtures/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,24 +165,41 @@ func (factory *Factory) CreateFdbClusterFromSpec(
config *ClusterConfig,
options ...ClusterOption,
) *FdbCluster {
startTime := time.Now()
config.SetDefaults(factory)
log.Printf("create cluster: %s", ToJSON(spec))

return factory.startFDBFromClusterSpec(spec, config, options...)
cluster := factory.startFDBFromClusterSpec(spec, config, options...)
log.Println(
"FoundationDB cluster created (at version",
cluster.cluster.Spec.Version,
") in minutes",
time.Since(startTime).Minutes(),
)

return cluster
}

// CreateFdbHaCluster creates a HA FDB Cluster based on the cluster config and cluster options
func (factory *Factory) CreateFdbHaCluster(
config *ClusterConfig,
options ...ClusterOption,
) *HaFdbCluster {
startTime := time.Now()
config.SetDefaults(factory)

cluster, err := factory.ensureHAFdbClusterExists(
config,
options,
)

log.Println(
"FoundationDB HA cluster created (at version",
cluster.GetPrimary().cluster.Spec.Version,
") in minutes",
time.Since(startTime).Minutes(),
)

gomega.Expect(err).ToNot(gomega.HaveOccurred())

return cluster
Expand Down
158 changes: 158 additions & 0 deletions e2e/test_operator_ha_flaky_upgrades/operator_ha_flaky_upgrade_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
/*
* operator_ha_flaky_upgrade_test.go
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2023 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package operatorhaflakyupgrades

/*
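This test suite contains the upgrade tests for a HA FoundationDB cluster that are currently flaky and were therefore
moved into this dedicated test suite until they are stable.
The tests validate the behaviour of the operator during upgrades on a HA FoundationDB cluster; the only test executed
here upgrades the cluster while all remote processes are skipped during the restart.
Each test will create a new HA FoundationDB cluster which will be upgraded.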
*/

import (
"log"
"time"

fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
"github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
chaosmesh "github.com/chaos-mesh/chaos-mesh/api/v1alpha1"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

// init stores the options returned by fixtures.InitFlags in testOptions when the package is
// loaded. testOptions is consumed later, when the factory is created and when the upgrade
// table entries are generated.
func init() {
testOptions = fixtures.InitFlags()
}

// Package-level state shared between the setup helpers and the specs of this suite.
var (
// factory is recreated in BeforeEach from testOptions and shut down in AfterEach.
factory *fixtures.Factory
// fdbCluster is the HA cluster under test; it is assigned by clusterSetupWithHealthCheckOption.
fdbCluster *fixtures.HaFdbCluster
// testOptions holds the options returned by fixtures.InitFlags (set in init).
testOptions *fixtures.FactoryOptions
)

// Suite-level teardown: when the report returned by CurrentSpecReport is marked as failed,
// its failure message is written to the log.
var _ = AfterSuite(func() {
	report := CurrentSpecReport()
	if !report.Failed() {
		return
	}

	log.Printf("failed due to %s", report.FailureMessage())
})

// clusterSetupWithHealthCheckOption creates a new HA cluster and stores it in the package-level
// fdbCluster variable.
//
// beforeVersion is set on the factory before the cluster is created. When enableHealthCheck is
// true, InvariantClusterStatusAvailableWithThreshold is invoked on the primary cluster with a
// threshold of 15 seconds and must not return an error. When enableOperatorPodChaos is true and
// the factory reports that chaos tests are enabled, a pod kill is scheduled for the operator
// selector of every cluster of the HA setup.
func clusterSetupWithHealthCheckOption(beforeVersion string, enableOperatorPodChaos bool, enableHealthCheck bool) {
	// The before version is set here to overwrite the before version from the specific flag;
	// the specific flag will be removed in the future.
	factory.SetBeforeVersion(beforeVersion)

	haConfig := fixtures.DefaultClusterConfigWithHaMode(fixtures.HaFourZoneSingleSat, false)
	clusterOptions := factory.GetClusterOptions(fixtures.UseVersionBeforeUpgrade)
	fdbCluster = factory.CreateFdbHaCluster(haConfig, clusterOptions...)

	if enableHealthCheck {
		primary := fdbCluster.GetPrimary()
		Expect(
			primary.InvariantClusterStatusAvailableWithThreshold(15 * time.Second),
		).ShouldNot(HaveOccurred())
	}

	// Nothing more to do unless operator pod chaos was requested and chaos tests are enabled.
	if !enableOperatorPodChaos || !factory.ChaosTestsEnabled() {
		return
	}

	for _, member := range fdbCluster.GetAllClusters() {
		operatorSelector := fixtures.GetOperatorSelector(member.Namespace())
		// Cron expression: every five minutes.
		factory.ScheduleInjectPodKill(operatorSelector, "*/5 * * * *", chaosmesh.OneMode)
	}
}

// clusterSetup creates the HA cluster for a test by delegating to
// clusterSetupWithHealthCheckOption with the health check enabled.
func clusterSetup(beforeVersion string, enableOperatorPodChaos bool) {
clusterSetupWithHealthCheckOption(beforeVersion, enableOperatorPodChaos, true)
}

// checkVersion waits until every cluster of the HA setup reports expectedVersion as its running
// version in the FoundationDBCluster status. The status is polled every 2 seconds for up to
// 10 minutes.
//
// An earlier approach inspected the cluster status json and compared the version reported by all
// processes. That only worked for version compatible upgrades, since incompatible processes won't
// be part of the cluster anyway. To keep the check simple, the running version reported by the
// operator is verified instead.
func checkVersion(cluster *fixtures.HaFdbCluster, expectedVersion string) {
	allClustersAtExpectedVersion := func() bool {
		for _, member := range cluster.GetAllClusters() {
			if member.GetCluster().Status.RunningVersion != expectedVersion {
				return false
			}
		}

		return true
	}

	Eventually(allClustersAtExpectedVersion).
		WithTimeout(10 * time.Minute).
		WithPolling(2 * time.Second).
		Should(BeTrue())
}

// Specs for upgrading a HA FoundationDB cluster. Every spec gets its own factory (BeforeEach)
// which is shut down again after the spec (AfterEach).
var _ = Describe("Operator HA Upgrades", Label("e2e", "nightly"), func() {
	BeforeEach(func() {
		// Drop the reference to the cluster of a previous spec. That cluster belongs to a
		// factory that was already shut down and must not be dumped if this spec fails
		// before it has created its own cluster.
		fdbCluster = nil
		factory = fixtures.CreateFactory(testOptions)
	})

	AfterEach(func() {
		// fdbCluster is still nil when the spec failed before the cluster was assigned.
		// Skipping the dump in that case avoids calling a method on a nil pointer and makes
		// sure that factory.Shutdown() below is always reached.
		if CurrentSpecReport().Failed() && fdbCluster != nil {
			fdbCluster.DumpState()
		}
		factory.Shutdown()
	})

	// https://github.com/FoundationDB/fdb-kubernetes-operator/issues/172, debug why this test is flaky and how
	// to make it stable.
	DescribeTable(
		"when no remote processes are restarted",
		func(beforeVersion string, targetVersion string) {
			clusterSetup(beforeVersion, false)

			// Select remote processes and use the buggify option to skip those
			// processes during the restart command.
			remoteProcessGroups := fdbCluster.GetRemote().GetCluster().Status.ProcessGroups
			ignoreDuringRestart := make(
				[]fdbv1beta2.ProcessGroupID,
				0,
				len(remoteProcessGroups),
			)

			for _, processGroup := range remoteProcessGroups {
				ignoreDuringRestart = append(
					ignoreDuringRestart,
					processGroup.ProcessGroupID,
				)
			}

			log.Println(
				"Selected Process Groups:",
				ignoreDuringRestart,
				"to be skipped during the restart",
			)

			// We have to set this to all clusters as any operator could be doing the cluster wide restart.
			for _, cluster := range fdbCluster.GetAllClusters() {
				cluster.SetIgnoreDuringRestart(ignoreDuringRestart)
			}

			// The cluster should still be able to upgrade.
			Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred())
			// Verify that the upgrade proceeds
			checkVersion(fdbCluster, targetVersion)

			// TODO add validation here processes are updated new version
		},
		// The entry name is built from the two table parameters (before and target version).
		EntryDescription("Upgrade from %[1]s to %[2]s"),
		fixtures.GenerateUpgradeTableEntries(testOptions),
	)
})
34 changes: 34 additions & 0 deletions e2e/test_operator_ha_flaky_upgrades/suite_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* suite_test.go
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2023 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package operatorhaflakyupgrades

import (
"testing"
"time"

"github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
. "github.com/onsi/gomega"
)

// TestOperatorHaUpgrade is the go test entry point of this suite. It sets the default timeout
// for Gomega's Eventually to 3 minutes and then hands over to fixtures.RunGinkgoTests together
// with the suite description.
// NOTE(review): the description does not mention that this is the flaky suite - confirm whether
// it should be distinguishable from the regular HA upgrade suite in test reports.
func TestOperatorHaUpgrade(t *testing.T) {
SetDefaultEventuallyTimeout(3 * time.Minute)
fixtures.RunGinkgoTests(t, "FDB Operator HA Upgrade Test Suite")
}
51 changes: 0 additions & 51 deletions e2e/test_operator_ha_upgrades/operator_ha_upgrade_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ func clusterSetupWithHealthCheckOption(beforeVersion string, enableOperatorPodCh
// We set the before version here to overwrite the before version from the specific flag
// the specific flag will be removed in the future.
factory.SetBeforeVersion(beforeVersion)
startTime := time.Now()
fdbCluster = factory.CreateFdbHaCluster(
fixtures.DefaultClusterConfigWithHaMode(fixtures.HaFourZoneSingleSat, false),
factory.GetClusterOptions(fixtures.UseVersionBeforeUpgrade)...,
Expand All @@ -73,13 +72,6 @@ func clusterSetupWithHealthCheckOption(beforeVersion string, enableOperatorPodCh
).ShouldNot(HaveOccurred())
}

log.Println(
"FoundationDB HA cluster created (at version",
beforeVersion,
") in minutes",
time.Since(startTime).Minutes(),
)

if enableOperatorPodChaos && factory.ChaosTestsEnabled() {
for _, curCluster := range fdbCluster.GetAllClusters() {
factory.ScheduleInjectPodKill(
Expand Down Expand Up @@ -524,47 +516,4 @@ var _ = Describe("Operator HA Upgrades", Label("e2e", "pr"), func() {
EntryDescription("Upgrade from %[1]s to %[2]s"),
fixtures.GenerateUpgradeTableEntries(testOptions),
)

DescribeTable(
"when no remote processes are restarted",
func(beforeVersion string, targetVersion string) {
clusterSetup(beforeVersion, false)

// Select remote processes and use the buggify option to skip those
// processes during the restart command.
remoteProcessGroups := fdbCluster.GetRemote().GetCluster().Status.ProcessGroups
ignoreDuringRestart := make(
[]fdbv1beta2.ProcessGroupID,
0,
len(remoteProcessGroups),
)

for _, processGroup := range remoteProcessGroups {
ignoreDuringRestart = append(
ignoreDuringRestart,
processGroup.ProcessGroupID,
)
}

log.Println(
"Selected Process Groups:",
ignoreDuringRestart,
"to be skipped during the restart",
)

// We have to set this to all clusters as any operator could be doing the cluster wide restart.
for _, cluster := range fdbCluster.GetAllClusters() {
cluster.SetIgnoreDuringRestart(ignoreDuringRestart)
}

// The cluster should still be able to upgrade.
Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred())
// Verify that the upgrade proceeds
checkVersion(fdbCluster, targetVersion)

// TODO add validation here processes are updated new version
},
EntryDescription("Upgrade from %[1]s to %[2]s"),
fixtures.GenerateUpgradeTableEntries(testOptions),
)
})
Loading

0 comments on commit a408433

Please sign in to comment.