Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add example otel-collector configurations #1152

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 183 additions & 0 deletions e2e/testcases/otel_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ package e2e
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"testing"
"time"
Expand All @@ -28,12 +30,14 @@ import (
"google.golang.org/api/iterator"
"google.golang.org/genproto/googleapis/api/metric"
"google.golang.org/genproto/googleapis/api/monitoredres"
"gopkg.in/yaml.v2"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"kpt.dev/configsync/e2e"
"kpt.dev/configsync/e2e/nomostest"
"kpt.dev/configsync/e2e/nomostest/iam"
testmetrics "kpt.dev/configsync/e2e/nomostest/metrics"
"kpt.dev/configsync/e2e/nomostest/ntopts"
"kpt.dev/configsync/e2e/nomostest/retry"
nomostesting "kpt.dev/configsync/e2e/nomostest/testing"
Expand Down Expand Up @@ -212,6 +216,158 @@ func TestOtelCollectorDeployment(t *testing.T) {
}
}

// TestOtelCollectorSampleConfigurations validates that metrics reporting works for
// Google Cloud Monitoring using the sample custom configurations.
//
// Requirements:
// - node identity:
// - node GSA with roles/monitoring.metricWriter IAM
//
// - workload identity:
// - e2e-test-metric-writer GSA with roles/monitoring.metricWriter IAM
// - roles/iam.workloadIdentityUser on config-management-monitoring/default for e2e-test-metric-writer
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need this anymore with BYOID, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. Tweaking the setup now, the tests are passing on failure scenarios, still working on it.

func TestOtelCollectorSampleConfigurations(t *testing.T) {
nt := nomostest.New(t,
nomostesting.Reconciliation1,
ntopts.RequireGKE(t),
ntopts.Unstructured,
)
nt.T.Cleanup(func() {
if t.Failed() {
nt.PodLogs("config-management-monitoring", ocmetrics.OtelCollectorName, "", false)
}
})
setupMetricsServiceAccount(nt)

nt.T.Cleanup(func() {
nt.MustKubectl("delete", "cm", ocmetrics.OtelCollectorCustomCM, "-n", configmanagement.MonitoringNamespace, "--ignore-not-found")
nt.T.Log("Restart otel-collector pod to reset the ConfigMap and log")
nomostest.DeletePodByLabel(nt, "app", ocmetrics.OpenTelemetry, false)
if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil {
nt.T.Errorf("otel-collector pod failed to come up after a restart: %v", err)
}
})

nt.T.Log("Restart otel-collector pod to refresh the ConfigMap, log and IAM")
nomostest.DeletePodByLabel(nt, "app", ocmetrics.OpenTelemetry, false)
if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil {
nt.T.Fatal(err)
}

startTime := time.Now().UTC()
ctx := nt.Context
client, err := createGCMClient(ctx)
if err != nil {
nt.T.Fatal(err)
}

nt.T.Log("Add the kustomize components root directory to enable kustomize metrics")
nt.Must(nt.RootRepos[configsync.RootSyncName].Copy("../testdata/hydration/kustomize-components", "."))
nt.Must(nt.RootRepos[configsync.RootSyncName].CommitAndPush("add DRY configs to the repository"))

nt.T.Log("Update RootSync to sync from the kustomize-components directory")
rs := fake.RootSyncObjectV1Beta1(configsync.RootSyncName)
nt.MustMergePatch(rs, `{"spec": {"git": {"dir": "kustomize-components"}}}`)
syncDirMap := map[types.NamespacedName]string{
nomostest.DefaultRootRepoNamespacedName: "kustomize-components",
}
if err := nt.WatchForAllSyncs(nomostest.WithSyncDirectoryMap(syncDirMap)); err != nil {
nt.T.Fatal(err)
}

directory := "../../examples/otel-collector-sample-configurations"
dirEntry, err := os.ReadDir(directory)
if err != nil {
nt.T.Fatal("Error opening directory:", err)
}
for _, entry := range dirEntry {
if !entry.IsDir() && strings.HasSuffix(entry.Name(), ".yaml") {
fileName := entry.Name()
fullPath := filepath.Join(directory, fileName)
nt.T.Log("Apply sample custom otel-collector configuration", fileName)
nt.MustKubectl("apply", "-f", fullPath)

err := nt.Validate(ocmetrics.OtelCollectorCustomCM, configmanagement.MonitoringNamespace, &corev1.ConfigMap{})
if err != nil {
nt.T.Fatal(err)
}

content, err := os.ReadFile(fullPath)
if err != nil {
nt.T.Fatal("failed to read file: %v", err)
}

var configMap ConfigMap
if err := yaml.Unmarshal(content, &configMap); err != nil {
nt.T.Fatal("error unmarshalling YAML: %v", err)
}

var otelConfig OtelConfig
if err := yaml.Unmarshal([]byte(configMap.Data.OtelCollectorConfig), &otelConfig); err != nil {
nt.T.Fatal("error: ", err)
}

// check cloud monitoring
_, err = retry.Retry(60*time.Second, func() error {
includGCM := pipelinesInclude("metrics/cloudmonitoring", otelConfig)
for _, metricType := range GCMMetricTypes {
descriptor := fmt.Sprintf("%s/%s", GCMMetricPrefix, metricType)
it := listMetricInGCM(ctx, nt, client, startTime, descriptor)
if includGCM {
return validateMetricInGCM(nt, it, descriptor, nt.ClusterName)
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

e2e/testcases/otel_collector_test.go:318:6: SA4004: the surrounding loop is unconditionally terminated (staticcheck)
					return nil
					^

}
return nil
})
if err != nil {
nt.T.Fatal(err)
}

// check prometheus
if pipelinesInclude("metrics/prometheus", otelConfig) {
summary := testmetrics.Summary{
Sync: nomostest.RootSyncNN(configsync.RootSyncName),
}
if _, found := nt.RootRepos[summary.Sync.Name]; !found {
nt.T.Fatal("Rootsync not found", configsync.RootSyncName)
}
commitHash, err := nt.RootRepos[summary.Sync.Name].Hash()
if err != nil {
nt.T.Fatal()
}
syncLabels, err := nomostest.MetricLabelsForRootSync(nt, summary.Sync)
if err != nil {
nt.T.Fatal(err)
}
err = nomostest.ValidateMetrics(nt,
nomostest.ReconcilerSyncSuccess(nt, syncLabels, commitHash),
nomostest.ReconcilerErrorMetrics(nt, syncLabels, commitHash, summary.Errors))
if err != nil {
nt.T.Fatal(err)
}
}

nt.T.Log("Checking the otel-collector log contains no failure...")
err = validateDeploymentLogHasNoFailure(nt, ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace, MetricExportErrorCaption)
if err != nil {
nt.T.Fatal(err)
}

nt.T.Log("Remove sample custom otel-collector configuration %v", fileName)
nt.MustKubectl("delete", "cm", ocmetrics.OtelCollectorCustomCM, "-n", configmanagement.MonitoringNamespace, "--ignore-not-found")
err = nt.ValidateNotFoundOrNoMatch(ocmetrics.OtelCollectorCustomCM, configmanagement.MonitoringNamespace, &corev1.ConfigMap{})
if err != nil {
nt.T.Fatal(err)
}
nt.T.Log("Restart otel-collector pod to refresh the ConfigMap and log")
//nomostest.DeletePodByLabel(nt, "app", ocmetrics.OpenTelemetry, false)
if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil {
nt.T.Fatal(err)
}
}
}
}

// TestOtelCollectorGCMLabelAggregation validates that Google Cloud Monitoring
// metrics to ensure that the "commit" label is removed through aggregation in
// the otel-collector config.
Expand Down Expand Up @@ -429,3 +585,30 @@ func validateMetricInGCM(nt *nomostest.NT, it *monitoringv2.TimeSeriesIterator,
return fmt.Errorf("GCM metric %s not found (cluster_name=%s)",
metricType, nt.ClusterName)
}

// ConfigMap is a minimal YAML view of a ConfigMap manifest: it decodes only
// the "otel-collector-config.yaml" entry found under the "data" key and
// ignores every other field of the manifest.
type ConfigMap struct {
Data struct {
// OtelCollectorConfig is the raw string stored under the
// "otel-collector-config.yaml" data key; it is itself a YAML document.
OtelCollectorConfig string `yaml:"otel-collector-config.yaml"`
} `yaml:"data"`
}

// OtelConfig is a minimal YAML view of an otel-collector configuration: it
// decodes only the "service.pipelines" section.
type OtelConfig struct {
Service struct {
// Pipelines maps each pipeline name (the YAML key under
// "service.pipelines") to its declared components.
Pipelines map[string]Pipeline `yaml:"pipelines"`
} `yaml:"service"`
}

// Pipeline lists the component names declared for one entry under
// "service.pipelines" of an otel-collector configuration.
type Pipeline struct {
// Receivers holds the values of the pipeline's "receivers" list.
Receivers []string `yaml:"receivers"`
// Processors holds the values of the pipeline's "processors" list.
Processors []string `yaml:"processors"`
// Exporters holds the values of the pipeline's "exporters" list.
Exporters []string `yaml:"exporters"`
}

// pipelinesInclude reports whether config declares a service pipeline whose
// name is exactly name. A config without any pipelines (nil map) yields false.
func pipelinesInclude(name string, config OtelConfig) bool {
	// A direct map lookup replaces the linear scan over the keys; reading from
	// a nil map is safe and simply reports "not found".
	_, found := config.Service.Pipelines[name]
	return found
}
25 changes: 25 additions & 0 deletions examples/otel-collector-sample-configurations/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Purpose of this directory

This folder provides sample [custom monitoring configurations](https://cloud.google.com/anthos-config-management/docs/how-to/monitor-config-sync-custom)
for Config Sync. These examples are intended for your convenience. While Config
Sync strives to keep them updated, always consult the [latest configuration](https://github.com/GoogleContainerTools/kpt-config-sync/blob/main/pkg/metrics/otel.go).
janetkuo marked this conversation as resolved.
Show resolved Hide resolved

# Available Configurations

_otel-collector-monarch.yaml_: Serves as a template for exporting metrics
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggest bold instead of underline. underline usually implies a link.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or even inline code maybe?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This format is italic, not underline.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which is the usual format when quoting a book name or doc name? I assume.

exclusively to Google Cloud Monarch and Prometheus.

_otel-collector-prometheus.yaml_: Serves as a template for exporting metrics
exclusively to Prometheus.

# Instructions

* **Apply ConfigMap**: Apply the desired ConfigMap to your cluster.
* **Restart otel-collector**: The otel-collector deployment should restart automatically. If it doesn't, execute:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does it know to restart?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The controller looks for a change in annotation and restarts the pod.

```
kubectl rollout restart deployment otel-collector -n config-management-monitoring
```
# Removal

* **Delete ConfigMap**: Remove the ConfigMap from your cluster.
* **Restart otel-collector**: The otel-collector deployment should restart automatically. If it doesn't, use the same kubectl rollout restart command as above.
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# Copyright 2024 Google LLC
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How will we know to update these when we change the gcloud template? Is there any way we can have a linter that produces this from that and then verify it matches without drift?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer we migrate away from the manual templating asap before implementing too much scripts, i.e. having API for monitoring configuration and filtering. These samples exist as a intermediate solution.

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: v1
kind: ConfigMap
metadata:
labels:
app: opentelemetry
component: otel-collector
configmanagement.gke.io/arch: csmr
configmanagement.gke.io/system: "true"
name: otel-collector-custom
namespace: config-management-monitoring
data:
otel-collector-config.yaml: |-
receivers:
opencensus:
exporters:
prometheus:
endpoint: :8675
namespace: config_sync
resource_to_telemetry_conversion:
enabled: true
googlecloud/kubernetes:
metric:
prefix: "kubernetes.io/internal/addons/config_sync/"
# skip_create_descriptor: Metrics whose names start with 'kubernetes.io/'
# already have descriptors defined internally. Skip sending duplicated metric
# descriptors here to prevent errors or conflicts.
skip_create_descriptor: true
# instrumentation_library_labels: Otel Collector by default attaches
# 'instrumentation_version' and 'instrumentation_source' labels that are
# not specified in our Cloud Monarch definitions, thus skipping them here
instrumentation_library_labels: false
# create_service_timeseries: This is the recommended configuration for
# 'service metrics', i.e. metrics whose names start with the 'kubernetes.io/'
# prefix. It uses the CreateServiceTimeSeries API, which has its own quotas,
# so that custom metric writes will not break this ingestion pipeline.
create_service_timeseries: true
service_resource_labels: false
retry_on_failure:
enabled: false
sending_queue:
enabled: false
processors:
batch:
# resourcedetection: This processor is needed to correctly mirror resource
# labels from OpenCensus to OpenTelemetry. We also want to keep this same
# processor in Otel Agent configuration as the resource labels are added from
# there
resourcedetection:
detectors: [env, gcp]
filter/kubernetes:
metrics:
include:
match_type: regexp
metric_names:
- kustomize.*
- api_duration_seconds
- reconciler_errors
- pipeline_error_observed
- reconcile_duration_seconds
- rg_reconcile_duration_seconds
- parser_duration_seconds
- declared_resources
- apply_operations_total
- apply_duration_seconds
- resource_fights_total
- remediate_duration_seconds
- resource_conflicts_total
- internal_errors_total
- rendering_count_total
- skip_rendering_count_total
- resource_override_count_total
- git_sync_depth_override_count_total
- no_ssl_verify_count_total
- kcc_resource_count
- last_sync_timestamp
# Remove custom configsync metric labels that are not registered with Monarch
# This action applies to all metrics that are sent through the pipeline that
# is using this processor
attributes/kubernetes:
actions:
- key: configsync.sync.kind
action: delete
- key: configsync.sync.name
action: delete
- key: configsync.sync.namespace
action: delete
- key: commit
action: delete
metricstransform/kubernetes:
transforms:
- include: declared_resources
action: update
new_name: current_declared_resources
- include: reconciler_errors
action: update
new_name: last_reconciler_errors
- include: pipeline_error_observed
action: update
new_name: last_pipeline_error_observed
- include: apply_operations_total
action: update
new_name: apply_operations_count
- include: resource_fights_total
action: update
new_name: resource_fights_count
- include: resource_conflicts_total
action: update
new_name: resource_conflicts_count
- include: internal_errors_total
action: update
new_name: internal_errors_count
- include: rendering_count_total
action: update
new_name: rendering_count
- include: skip_rendering_count_total
action: update
new_name: skip_rendering_count
- include: resource_override_count_total
action: update
new_name: resource_override_count
- include: git_sync_depth_override_count_total
action: update
new_name: git_sync_depth_override_count
- include: no_ssl_verify_count_total
action: update
new_name: no_ssl_verify_count
extensions:
health_check:
service:
extensions: [health_check]
pipelines:
metrics/prometheus:
receivers: [opencensus]
processors: [batch]
exporters: [prometheus]
metrics/kubernetes:
receivers: [opencensus]
processors: [batch, filter/kubernetes, attributes/kubernetes, metricstransform/kubernetes, resourcedetection]
exporters: [googlecloud/kubernetes]
Loading