Skip to content

Commit

Permalink
Add configsync_sync_generation metric resource tag
Browse files Browse the repository at this point in the history
- Add configsync.gke.io/sync-generation reconciler deployment/pod label,
  populated by the reconciler-manager
- Add CONFIGSYNC_SYNC_GENERATION env var on the otel-agent container,
  populated by the k8s downward API from the pod label
- Add configsync_sync_generation metric resource label on the otel-agent
  config, populated from the env var
- Add an attribute filter to the otel-collector config that deletes the
  configsync.sync.generation label when sending to Monarch
- Change the e2e tests to use the sync metric labels instead of the
  pod name. This allows the metrics validation to tolerate pod
  replacement due to rescheduling, OOM kills, or autoscaling.
  • Loading branch information
karlkfi committed Jul 25, 2023
1 parent 8973d80 commit ca0b4cb
Show file tree
Hide file tree
Showing 23 changed files with 426 additions and 237 deletions.
221 changes: 122 additions & 99 deletions e2e/nomostest/prometheus_metrics.go

Large diffs are not rendered by default.

8 changes: 3 additions & 5 deletions e2e/testcases/cluster_selectors_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -853,17 +853,15 @@ func TestClusterSelectorAnnotationConflicts(t *testing.T) {
nt.Must(nt.RootRepos[configsync.RootSyncName].CommitAndPush("Add both cluster selector annotations to a role binding"))
nt.WaitForRootSyncSourceError(configsync.RootSyncName, selectors.ClusterSelectorAnnotationConflictErrorCode, "")

rootReconcilerPod, err := nt.KubeClient.GetDeploymentPod(
nomostest.DefaultRootReconcilerName, configmanagement.ControllerNamespace,
nt.DefaultWaitTimeout)
rootSyncNN := nomostest.RootSyncNN(configsync.RootSyncName)
rootSyncLabels, err := nomostest.MetricLabelsForRootSync(nt, rootSyncNN)
if err != nil {
nt.T.Fatal(err)
}

commitHash := nt.RootRepos[configsync.RootSyncName].MustHash(nt.T)

err = nomostest.ValidateMetrics(nt,
nomostest.ReconcilerErrorMetrics(nt, rootReconcilerPod.Name, commitHash, metrics.ErrorSummary{
nomostest.ReconcilerErrorMetrics(nt, rootSyncLabels, commitHash, metrics.ErrorSummary{
Source: 1,
}))
if err != nil {
Expand Down
8 changes: 2 additions & 6 deletions e2e/testcases/custom_resource_definitions_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ import (
nomostesting "kpt.dev/configsync/e2e/nomostest/testing"
"kpt.dev/configsync/e2e/nomostest/testpredicates"
"kpt.dev/configsync/e2e/nomostest/testwatcher"
"kpt.dev/configsync/pkg/api/configmanagement"
"kpt.dev/configsync/pkg/api/configsync"
"kpt.dev/configsync/pkg/importer/analyzer/validation/nonhierarchical"
"kpt.dev/configsync/pkg/kinds"
Expand Down Expand Up @@ -80,17 +79,14 @@ func mustRemoveCustomResourceWithDefinition(nt *nomostest.NT, crd client.Object)

nt.WaitForRootSyncSourceError(configsync.RootSyncName, nonhierarchical.UnsupportedCRDRemovalErrorCode, "")

rootReconcilerPod, err := nt.KubeClient.GetDeploymentPod(
nomostest.DefaultRootReconcilerName, configmanagement.ControllerNamespace,
nt.DefaultWaitTimeout)
rootSyncLabels, err := nomostest.MetricLabelsForRootSync(nt, rootSyncNN)
if err != nil {
nt.T.Fatal(err)
}

commitHash := nt.RootRepos[configsync.RootSyncName].MustHash(nt.T)

err = nomostest.ValidateMetrics(nt,
nomostest.ReconcilerErrorMetrics(nt, rootReconcilerPod.Name, commitHash, metrics.ErrorSummary{
nomostest.ReconcilerErrorMetrics(nt, rootSyncLabels, commitHash, metrics.ErrorSummary{
Source: 1,
}))
if err != nil {
Expand Down
16 changes: 5 additions & 11 deletions e2e/testcases/custom_resources_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ import (
nomostesting "kpt.dev/configsync/e2e/nomostest/testing"
"kpt.dev/configsync/e2e/nomostest/testpredicates"
"kpt.dev/configsync/e2e/nomostest/testwatcher"
"kpt.dev/configsync/pkg/api/configmanagement"
"kpt.dev/configsync/pkg/api/configsync"
"kpt.dev/configsync/pkg/kinds"
"kpt.dev/configsync/pkg/status"
Expand Down Expand Up @@ -119,17 +118,14 @@ func TestCRDDeleteBeforeRemoveCustomResourceV1Beta1(t *testing.T) {

nt.WaitForRootSyncSourceError(configsync.RootSyncName, status.UnknownKindErrorCode, "")

rootReconcilerPod, err := nt.KubeClient.GetDeploymentPod(
nomostest.DefaultRootReconcilerName, configmanagement.ControllerNamespace,
nt.DefaultWaitTimeout)
rootSyncLabels, err := nomostest.MetricLabelsForRootSync(nt, rootSyncNN)
if err != nil {
nt.T.Fatal(err)
}

commitHash := nt.RootRepos[configsync.RootSyncName].MustHash(nt.T)

err = nomostest.ValidateMetrics(nt,
nomostest.ReconcilerErrorMetrics(nt, rootReconcilerPod.Name, commitHash, metrics.ErrorSummary{
nomostest.ReconcilerErrorMetrics(nt, rootSyncLabels, commitHash, metrics.ErrorSummary{
Source: 1,
}))
if err != nil {
Expand Down Expand Up @@ -234,20 +230,18 @@ func TestCRDDeleteBeforeRemoveCustomResourceV1(t *testing.T) {

nt.WaitForRootSyncSourceError(configsync.RootSyncName, status.UnknownKindErrorCode, "")

rootReconcilerPod, err := nt.KubeClient.GetDeploymentPod(
nomostest.DefaultRootReconcilerName, configmanagement.ControllerNamespace,
nt.DefaultWaitTimeout)
rootSyncLabels, err := nomostest.MetricLabelsForRootSync(nt, rootSyncNN)
if err != nil {
nt.T.Fatal(err)
}

err = nomostest.ValidateMetrics(nt,
nomostest.ReconcilerErrorMetrics(nt, rootReconcilerPod.Name, firstCommitHash, metrics.ErrorSummary{
nomostest.ReconcilerErrorMetrics(nt, rootSyncLabels, firstCommitHash, metrics.ErrorSummary{
// Remediator conflict after the first commit, because the declared
// Anvil was deleted by another client after successful sync.
Conflicts: 1,
}),
nomostest.ReconcilerErrorMetrics(nt, rootReconcilerPod.Name, secondCommitHash, metrics.ErrorSummary{
nomostest.ReconcilerErrorMetrics(nt, rootSyncLabels, secondCommitHash, metrics.ErrorSummary{
// No remediator conflict after the second commit, because the
// reconciler hasn't been updated with the latest declared resources,
// because there was a source error.
Expand Down
36 changes: 14 additions & 22 deletions e2e/testcases/invalid_git_branch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@ import (
"kpt.dev/configsync/e2e/nomostest/metrics"
"kpt.dev/configsync/e2e/nomostest/ntopts"
nomostesting "kpt.dev/configsync/e2e/nomostest/testing"
"kpt.dev/configsync/pkg/api/configmanagement"
"kpt.dev/configsync/pkg/api/configsync"
"kpt.dev/configsync/pkg/core"
"kpt.dev/configsync/pkg/status"
"kpt.dev/configsync/pkg/testing/fake"
)
Expand All @@ -38,17 +36,15 @@ func TestInvalidRootSyncBranchStatus(t *testing.T) {

nt.WaitForRootSyncSourceError(configsync.RootSyncName, status.SourceErrorCode, "")

rootReconcilerPod, err := nt.KubeClient.GetDeploymentPod(
nomostest.DefaultRootReconcilerName, configmanagement.ControllerNamespace,
nt.DefaultWaitTimeout)
rootSyncNN := nomostest.RootSyncNN(configsync.RootSyncName)
rootSyncLabels, err := nomostest.MetricLabelsForRootSync(nt, rootSyncNN)
if err != nil {
nt.T.Fatal(err)
}

commitHash := nt.RootRepos[configsync.RootSyncName].MustHash(nt.T)

err = nomostest.ValidateMetrics(nt,
nomostest.ReconcilerErrorMetrics(nt, rootReconcilerPod.Name, commitHash, metrics.ErrorSummary{
nomostest.ReconcilerErrorMetrics(nt, rootSyncLabels, commitHash, metrics.ErrorSummary{
Source: 1,
}))
if err != nil {
Expand All @@ -69,10 +65,10 @@ func TestInvalidRootSyncBranchStatus(t *testing.T) {

func TestInvalidRepoSyncBranchStatus(t *testing.T) {
nt := nomostest.New(t, nomostesting.SyncSource, ntopts.NamespaceRepo(namespaceRepo, configsync.RepoSyncName))
nn := nomostest.RepoSyncNN(namespaceRepo, configsync.RepoSyncName)
rs := nomostest.RepoSyncObjectV1Beta1FromNonRootRepo(nt, nn)
rs.Spec.Branch = "invalid-branch"
nt.Must(nt.RootRepos[configsync.RootSyncName].Add(nomostest.StructuredNSPath(namespaceRepo, rs.Name), rs))
repoSyncNN := nomostest.RepoSyncNN(namespaceRepo, configsync.RepoSyncName)
repoSync := nomostest.RepoSyncObjectV1Beta1FromNonRootRepo(nt, repoSyncNN)
repoSync.Spec.Branch = "invalid-branch"
nt.Must(nt.RootRepos[configsync.RootSyncName].Add(nomostest.StructuredNSPath(namespaceRepo, repoSync.Name), repoSync))
nt.Must(nt.RootRepos[configsync.RootSyncName].CommitAndPush("Update RepoSync to invalid branch name"))

nt.WaitForRepoSyncSourceError(namespaceRepo, configsync.RepoSyncName, status.SourceErrorCode, "")
Expand All @@ -85,31 +81,27 @@ func TestInvalidRepoSyncBranchStatus(t *testing.T) {
nt.T.Fatal(err)
}

nsReconcilerName := core.NsReconcilerName(nn.Namespace, nn.Name)
nsReconcilerPod, err := nt.KubeClient.GetDeploymentPod(
nsReconcilerName, configmanagement.ControllerNamespace,
nt.DefaultWaitTimeout)
repoSyncLabels, err := nomostest.MetricLabelsForRepoSync(nt, repoSyncNN)
if err != nil {
nt.T.Fatal(err)
}

commitHash := nt.RootRepos[configsync.RootSyncName].MustHash(nt.T)
commitHash := nt.NonRootRepos[repoSyncNN].MustHash(nt.T)

err = nomostest.ValidateMetrics(nt,
// Source error prevents apply, so don't wait for a sync with the current commit.
nomostest.ReconcilerErrorMetrics(nt, nsReconcilerPod.Name, commitHash, metrics.ErrorSummary{
nomostest.ReconcilerErrorMetrics(nt, repoSyncLabels, commitHash, metrics.ErrorSummary{
Source: 1,
}))
if err != nil {
nt.T.Fatal(err)
}

rs.Spec.Branch = gitproviders.MainBranch
nt.Must(nt.RootRepos[configsync.RootSyncName].Add(nomostest.StructuredNSPath(namespaceRepo, rs.Name), rs))
repoSync.Spec.Branch = gitproviders.MainBranch
nt.Must(nt.RootRepos[configsync.RootSyncName].Add(nomostest.StructuredNSPath(namespaceRepo, repoSync.Name), repoSync))
nt.Must(nt.RootRepos[configsync.RootSyncName].CommitAndPush("Update RepoSync to valid branch name"))

// Ensure RepoSync's active branch is checked out, so the correct commit is used for validation.
nt.Must(nt.NonRootRepos[nn].CheckoutBranch(gitproviders.MainBranch))
nt.Must(nt.NonRootRepos[repoSyncNN].CheckoutBranch(gitproviders.MainBranch))

if err := nt.WatchForAllSyncs(); err != nil {
nt.T.Fatal(err)
Expand All @@ -124,7 +116,7 @@ func TestInvalidRepoSyncBranchStatus(t *testing.T) {
}

err = nomostest.ValidateStandardMetricsForRepoSync(nt, metrics.Summary{
Sync: nn,
Sync: repoSyncNN,
ObjectCount: 0, // no additional managed objects
})
if err != nil {
Expand Down
31 changes: 11 additions & 20 deletions e2e/testcases/multi_sync_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ import (
nomostesting "kpt.dev/configsync/e2e/nomostest/testing"
"kpt.dev/configsync/e2e/nomostest/testpredicates"
"kpt.dev/configsync/e2e/nomostest/testwatcher"
"kpt.dev/configsync/pkg/api/configmanagement"
"kpt.dev/configsync/pkg/api/configsync"
"kpt.dev/configsync/pkg/api/configsync/v1beta1"
"kpt.dev/configsync/pkg/applier"
Expand Down Expand Up @@ -243,6 +242,8 @@ func TestMultiSyncs_Unstructured_MixedControl(t *testing.T) {
validateReconcilerResource(nt, kinds.Secret(), map[string]string{metadata.SyncNamespaceLabel: testNs}, 5)
validateReconcilerResource(nt, kinds.Secret(), map[string]string{metadata.SyncNamespaceLabel: testNs2}, 1)
validateReconcilerResource(nt, kinds.Secret(), map[string]string{metadata.SyncNameLabel: nr1}, 2)

// TODO: validate sync-generation label
}

func validateReconcilerResource(nt *nomostest.NT, gvk schema.GroupVersionKind, labels map[string]string, expectedCount int) {
Expand Down Expand Up @@ -303,23 +304,19 @@ func TestConflictingDefinitions_RootToNamespace(t *testing.T) {
nt.WaitForRepoSyncSyncError(repoSyncNN.Namespace, repoSyncNN.Name, status.ManagementConflictErrorCode, "declared in another repository")

nt.T.Logf("Validate reconciler error metric is emitted from namespace reconciler %s", repoSyncNN)
nsReconcilerName := core.NsReconcilerName(repoSyncNN.Namespace, repoSyncNN.Name)
nsReconcilerPod, err := nt.KubeClient.GetDeploymentPod(
nsReconcilerName, configmanagement.ControllerNamespace,
nt.DefaultWaitTimeout)
rootSyncLabels, err := nomostest.MetricLabelsForRootSync(nt, rootSyncNN)
if err != nil {
nt.T.Fatal(err)
}

commitHash := nt.NonRootRepos[repoSyncNN].MustHash(nt.T)

err = nomostest.ValidateMetrics(nt,
// ManagementConflictErrorWrap is recorded by the remediator, while
// KptManagementConflictError is recorded by the applier, but they have
// similar error messages. So while there should be a ReconcilerError
// metric, there might not be a LastSyncTimestamp with status=error.
// nomostest.ReconcilerSyncError(nt, nsReconcilerPod.Name, commitHash),
nomostest.ReconcilerErrorMetrics(nt, nsReconcilerPod.Name, commitHash, metrics.ErrorSummary{
// nomostest.ReconcilerSyncError(nt, rootSyncLabels, commitHash),
nomostest.ReconcilerErrorMetrics(nt, rootSyncLabels, commitHash, metrics.ErrorSummary{
Sync: 1,
}))
if err != nil {
Expand Down Expand Up @@ -437,18 +434,15 @@ func TestConflictingDefinitions_NamespaceToRoot(t *testing.T) {
}

// Validate reconciler error metric is emitted from namespace reconciler.
nsReconcilerName := core.NsReconcilerName(repoSyncNN.Namespace, repoSyncNN.Name)
nsReconcilerPod, err := nt.KubeClient.GetDeploymentPod(
nsReconcilerName, configmanagement.ControllerNamespace,
nt.DefaultWaitTimeout)
rootSyncLabels, err := nomostest.MetricLabelsForRepoSync(nt, repoSyncNN)
if err != nil {
nt.T.Fatal(err)
}
commitHash := nt.NonRootRepos[repoSyncNN].MustHash(nt.T)

err = nomostest.ValidateMetrics(nt,
nomostest.ReconcilerSyncError(nt, nsReconcilerPod.Name, commitHash),
nomostest.ReconcilerErrorMetrics(nt, nsReconcilerPod.Name, commitHash, metrics.ErrorSummary{
nomostest.ReconcilerSyncError(nt, rootSyncLabels, commitHash),
nomostest.ReconcilerErrorMetrics(nt, rootSyncLabels, commitHash, metrics.ErrorSummary{
Sync: 1,
}))
if err != nil {
Expand Down Expand Up @@ -697,18 +691,15 @@ func TestConflictingDefinitions_NamespaceToNamespace(t *testing.T) {
nt.T.Fatal(err)
}
nt.T.Logf("Validate reconciler error metric is emitted from Namespace reconciler %s", repoSyncNN2)
nsReconciler2Name := core.NsReconcilerName(repoSyncNN2.Namespace, repoSyncNN2.Name)
nsReconciler2Pod, err := nt.KubeClient.GetDeploymentPod(
nsReconciler2Name, configmanagement.ControllerNamespace,
nt.DefaultWaitTimeout)
repoSync2Labels, err := nomostest.MetricLabelsForRepoSync(nt, repoSyncNN2)
if err != nil {
nt.T.Fatal(err)
}
commitHash := nt.NonRootRepos[repoSyncNN2].MustHash(nt.T)

err = nomostest.ValidateMetrics(nt,
nomostest.ReconcilerSyncError(nt, nsReconciler2Pod.Name, commitHash),
nomostest.ReconcilerErrorMetrics(nt, nsReconciler2Pod.Name, commitHash, metrics.ErrorSummary{
nomostest.ReconcilerSyncError(nt, repoSync2Labels, commitHash),
nomostest.ReconcilerErrorMetrics(nt, repoSync2Labels, commitHash, metrics.ErrorSummary{
Sync: 1,
}))
if err != nil {
Expand Down
16 changes: 5 additions & 11 deletions e2e/testcases/namespace_repo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ import (
nomostesting "kpt.dev/configsync/e2e/nomostest/testing"
"kpt.dev/configsync/e2e/nomostest/testpredicates"
"kpt.dev/configsync/e2e/nomostest/testwatcher"
"kpt.dev/configsync/pkg/api/configmanagement"
v1 "kpt.dev/configsync/pkg/api/configmanagement/v1"
"kpt.dev/configsync/pkg/api/configsync"
"kpt.dev/configsync/pkg/api/configsync/v1beta1"
Expand Down Expand Up @@ -345,6 +344,7 @@ func checkRepoSyncResourcesNotPresent(nt *nomostest.NT, namespace string, secret

func TestDeleteNamespaceReconcilerDeployment(t *testing.T) {
bsNamespace := "bookstore"
rootSyncNN := nomostest.RootSyncNN(configsync.RootSyncName)
repoSyncNN := nomostest.RepoSyncNN(bsNamespace, configsync.RepoSyncName)
nt := nomostest.New(
t,
Expand All @@ -353,7 +353,6 @@ func TestDeleteNamespaceReconcilerDeployment(t *testing.T) {
ntopts.WithCentralizedControl,
)

rootReconciler := core.RootReconcilerName(configsync.RootSyncName)
nsReconciler := core.NsReconcilerName(bsNamespace, configsync.RepoSyncName)

// Validate status condition "Reconciling" and Stalled is set to "False" after
Expand Down Expand Up @@ -391,27 +390,22 @@ func TestDeleteNamespaceReconcilerDeployment(t *testing.T) {
nt.T.Errorf("RepoSync did not finish reconciling: %v", err)
}

rootSyncReconcilerPod, err := nt.KubeClient.GetDeploymentPod(
rootReconciler, configmanagement.ControllerNamespace,
nt.DefaultWaitTimeout)
rootSyncLabels, err := nomostest.MetricLabelsForRootSync(nt, rootSyncNN)
if err != nil {
nt.T.Fatal(err)
}
repoSyncReconcilerPod, err := nt.KubeClient.GetDeploymentPod(
nsReconciler, configmanagement.ControllerNamespace,
nt.DefaultWaitTimeout)
repoSyncLabels, err := nomostest.MetricLabelsForRepoSync(nt, repoSyncNN)
if err != nil {
nt.T.Fatal(err)
}

rootCommitHash := nt.RootRepos[configsync.RootSyncName].MustHash(nt.T)
nnCommitHash := nt.NonRootRepos[repoSyncNN].MustHash(nt.T)

// Skip sync & ops metrics and just validate reconciler-manager and reconciler errors.
err = nomostest.ValidateMetrics(nt,
nomostest.ReconcilerManagerMetrics(nt),
nomostest.ReconcilerErrorMetrics(nt, rootSyncReconcilerPod.Name, rootCommitHash, metrics.ErrorSummary{}),
nomostest.ReconcilerErrorMetrics(nt, repoSyncReconcilerPod.Name, nnCommitHash, metrics.ErrorSummary{}))
nomostest.ReconcilerErrorMetrics(nt, rootSyncLabels, rootCommitHash, metrics.ErrorSummary{}),
nomostest.ReconcilerErrorMetrics(nt, repoSyncLabels, nnCommitHash, metrics.ErrorSummary{}))
if err != nil {
nt.T.Fatal(err)
}
Expand Down
13 changes: 5 additions & 8 deletions e2e/testcases/namespaces_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ import (
"kpt.dev/configsync/e2e/nomostest/ntopts"
nomostesting "kpt.dev/configsync/e2e/nomostest/testing"
"kpt.dev/configsync/e2e/nomostest/testpredicates"
"kpt.dev/configsync/pkg/api/configmanagement"
"kpt.dev/configsync/pkg/api/configsync"
"kpt.dev/configsync/pkg/core"
"kpt.dev/configsync/pkg/kinds"
Expand Down Expand Up @@ -725,21 +724,19 @@ func TestDontDeleteAllNamespaces(t *testing.T) {
safetyNSObj := fake.NamespaceObject(nt.RootRepos[configsync.RootSyncName].SafetyNSName)
nt.MetricsExpectations.RemoveObject(configsync.RootSyncKind, rootSyncNN, safetyNSObj)

rootReconcilerPod, err := nt.KubeClient.GetDeploymentPod(
nomostest.DefaultRootReconcilerName, configmanagement.ControllerNamespace,
nt.DefaultWaitTimeout)
rootSyncLabels, err := nomostest.MetricLabelsForRootSync(nt, rootSyncNN)
if err != nil {
nt.T.Fatal(err)
}
commitHash := nt.RootRepos[configsync.RootSyncName].MustHash(nt.T)

err = nomostest.ValidateMetrics(nt,
nomostest.ReconcilerSyncError(nt, rootReconcilerPod.Name, commitHash),
nomostest.ReconcilerSourceMetrics(nt, rootReconcilerPod.Name, commitHash,
nomostest.ReconcilerSyncError(nt, rootSyncLabels, commitHash),
nomostest.ReconcilerSourceMetrics(nt, rootSyncLabels, commitHash,
nt.MetricsExpectations.ExpectedRootSyncObjectCount(configsync.RootSyncName)),
nomostest.ReconcilerOperationsMetrics(nt, rootReconcilerPod.Name,
nomostest.ReconcilerOperationsMetrics(nt, rootSyncLabels,
nt.MetricsExpectations.ExpectedRootSyncObjectOperations(configsync.RootSyncName)...),
nomostest.ReconcilerErrorMetrics(nt, rootReconcilerPod.Name, commitHash, metrics.ErrorSummary{
nomostest.ReconcilerErrorMetrics(nt, rootSyncLabels, commitHash, metrics.ErrorSummary{
Sync: 1,
}))
if err != nil {
Expand Down
Loading

0 comments on commit ca0b4cb

Please sign in to comment.