Skip to content

Commit

Permalink
Add example otel-collector configurations
Browse files Browse the repository at this point in the history
  • Loading branch information
tiffanny29631 committed Mar 5, 2024
1 parent 64e8f7e commit 7dff4c0
Show file tree
Hide file tree
Showing 4 changed files with 408 additions and 0 deletions.
183 changes: 183 additions & 0 deletions e2e/testcases/otel_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ package e2e
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"testing"
"time"
Expand All @@ -28,12 +30,14 @@ import (
"google.golang.org/api/iterator"
"google.golang.org/genproto/googleapis/api/metric"
"google.golang.org/genproto/googleapis/api/monitoredres"
"gopkg.in/yaml.v2"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"kpt.dev/configsync/e2e"
"kpt.dev/configsync/e2e/nomostest"
"kpt.dev/configsync/e2e/nomostest/iam"
testmetrics "kpt.dev/configsync/e2e/nomostest/metrics"
"kpt.dev/configsync/e2e/nomostest/ntopts"
"kpt.dev/configsync/e2e/nomostest/retry"
nomostesting "kpt.dev/configsync/e2e/nomostest/testing"
Expand Down Expand Up @@ -212,6 +216,158 @@ func TestOtelCollectorDeployment(t *testing.T) {
}
}

// TestOtelCollectorSampleConfigurations validates that metrics reporting works
// using each of the sample custom otel-collector configurations found in
// examples/otel-collector-sample-configurations.
//
// For every *.yaml file in that directory the test applies the ConfigMap,
// parses the otel-collector config embedded in it, and then:
//   - if a "metrics/cloudmonitoring" pipeline is declared, validates every
//     metric type in GCMMetricTypes against Google Cloud Monitoring;
//   - if a "metrics/prometheus" pipeline is declared, runs
//     nomostest.ValidateMetrics for the RootSync reconciler metrics;
//   - runs validateDeploymentLogHasNoFailure against the otel-collector
//     deployment log.
//
// The custom ConfigMap is removed again before moving on to the next sample.
//
// Requirements:
//   - node identity:
//   - node GSA with roles/monitoring.metricWriter IAM
//
//   - workload identity:
//   - e2e-test-metric-writer GSA with roles/monitoring.metricWriter IAM
//   - roles/iam.workloadIdentityUser on config-management-monitoring/default for e2e-test-metric-writer
func TestOtelCollectorSampleConfigurations(t *testing.T) {
	nt := nomostest.New(t,
		nomostesting.Reconciliation1,
		ntopts.RequireGKE(t),
		ntopts.Unstructured,
	)
	nt.T.Cleanup(func() {
		if t.Failed() {
			nt.PodLogs("config-management-monitoring", ocmetrics.OtelCollectorName, "", false)
		}
	})
	setupMetricsServiceAccount(nt)

	nt.T.Cleanup(func() {
		nt.MustKubectl("delete", "cm", ocmetrics.OtelCollectorCustomCM, "-n", configmanagement.MonitoringNamespace, "--ignore-not-found")
		nt.T.Log("Restart otel-collector pod to reset the ConfigMap and log")
		nomostest.DeletePodByLabel(nt, "app", ocmetrics.OpenTelemetry, false)
		if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil {
			nt.T.Errorf("otel-collector pod failed to come up after a restart: %v", err)
		}
	})

	nt.T.Log("Restart otel-collector pod to refresh the ConfigMap, log and IAM")
	nomostest.DeletePodByLabel(nt, "app", ocmetrics.OpenTelemetry, false)
	if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil {
		nt.T.Fatal(err)
	}

	// Only time series written after this instant are considered when
	// querying Google Cloud Monitoring below.
	startTime := time.Now().UTC()
	ctx := nt.Context
	client, err := createGCMClient(ctx)
	if err != nil {
		nt.T.Fatal(err)
	}

	nt.T.Log("Add the kustomize components root directory to enable kustomize metrics")
	nt.Must(nt.RootRepos[configsync.RootSyncName].Copy("../testdata/hydration/kustomize-components", "."))
	nt.Must(nt.RootRepos[configsync.RootSyncName].CommitAndPush("add DRY configs to the repository"))

	nt.T.Log("Update RootSync to sync from the kustomize-components directory")
	rs := fake.RootSyncObjectV1Beta1(configsync.RootSyncName)
	nt.MustMergePatch(rs, `{"spec": {"git": {"dir": "kustomize-components"}}}`)
	syncDirMap := map[types.NamespacedName]string{
		nomostest.DefaultRootRepoNamespacedName: "kustomize-components",
	}
	if err := nt.WatchForAllSyncs(nomostest.WithSyncDirectoryMap(syncDirMap)); err != nil {
		nt.T.Fatal(err)
	}

	directory := "../../examples/otel-collector-sample-configurations"
	dirEntry, err := os.ReadDir(directory)
	if err != nil {
		nt.T.Fatal("Error opening directory:", err)
	}
	for _, entry := range dirEntry {
		// Only regular *.yaml files are sample configurations (the directory
		// also holds a README).
		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".yaml") {
			continue
		}
		fileName := entry.Name()
		fullPath := filepath.Join(directory, fileName)
		nt.T.Log("Apply sample custom otel-collector configuration", fileName)
		nt.MustKubectl("apply", "-f", fullPath)

		if err := nt.Validate(ocmetrics.OtelCollectorCustomCM, configmanagement.MonitoringNamespace, &corev1.ConfigMap{}); err != nil {
			nt.T.Fatal(err)
		}

		// Parse the sample twice: first the ConfigMap manifest, then the
		// otel-collector config stored as a string inside its data section.
		content, err := os.ReadFile(fullPath)
		if err != nil {
			nt.T.Fatalf("failed to read file %s: %v", fullPath, err)
		}

		var configMap ConfigMap
		if err := yaml.Unmarshal(content, &configMap); err != nil {
			nt.T.Fatalf("error unmarshalling YAML from %s: %v", fullPath, err)
		}

		var otelConfig OtelConfig
		if err := yaml.Unmarshal([]byte(configMap.Data.OtelCollectorConfig), &otelConfig); err != nil {
			nt.T.Fatalf("error unmarshalling otel-collector config from %s: %v", fullPath, err)
		}

		// check cloud monitoring
		// The pipeline check does not change between retries, so it is
		// evaluated once and GCM is only queried when the pipeline exists.
		if pipelinesInclude("metrics/cloudmonitoring", otelConfig) {
			_, err = retry.Retry(60*time.Second, func() error {
				for _, metricType := range GCMMetricTypes {
					descriptor := fmt.Sprintf("%s/%s", GCMMetricPrefix, metricType)
					it := listMetricInGCM(ctx, nt, client, startTime, descriptor)
					// Return only on failure so that every metric type is
					// validated, not just the first one.
					if err := validateMetricInGCM(nt, it, descriptor, nt.ClusterName); err != nil {
						return err
					}
				}
				return nil
			})
			if err != nil {
				nt.T.Fatal(err)
			}
		}

		// check prometheus
		if pipelinesInclude("metrics/prometheus", otelConfig) {
			summary := testmetrics.Summary{
				Sync: nomostest.RootSyncNN(configsync.RootSyncName),
			}
			rootRepo, found := nt.RootRepos[summary.Sync.Name]
			if !found {
				nt.T.Fatal("Rootsync not found", configsync.RootSyncName)
			}
			commitHash, err := rootRepo.Hash()
			if err != nil {
				nt.T.Fatal(err)
			}
			syncLabels, err := nomostest.MetricLabelsForRootSync(nt, summary.Sync)
			if err != nil {
				nt.T.Fatal(err)
			}
			err = nomostest.ValidateMetrics(nt,
				nomostest.ReconcilerSyncSuccess(nt, syncLabels, commitHash),
				nomostest.ReconcilerErrorMetrics(nt, syncLabels, commitHash, summary.Errors))
			if err != nil {
				nt.T.Fatal(err)
			}
		}

		nt.T.Log("Checking the otel-collector log contains no failure...")
		err = validateDeploymentLogHasNoFailure(nt, ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace, MetricExportErrorCaption)
		if err != nil {
			nt.T.Fatal(err)
		}

		nt.T.Logf("Remove sample custom otel-collector configuration %v", fileName)
		nt.MustKubectl("delete", "cm", ocmetrics.OtelCollectorCustomCM, "-n", configmanagement.MonitoringNamespace, "--ignore-not-found")
		err = nt.ValidateNotFoundOrNoMatch(ocmetrics.OtelCollectorCustomCM, configmanagement.MonitoringNamespace, &corev1.ConfigMap{})
		if err != nil {
			nt.T.Fatal(err)
		}
		// NOTE(review): the pod is not deleted explicitly here (that call was
		// commented out in the original); presumably the deployment rolls on
		// its own once the custom ConfigMap is gone — confirm. This step only
		// waits for the deployment to report Current again.
		nt.T.Log("Wait for the otel-collector deployment to be current after removing the ConfigMap")
		if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil {
			nt.T.Fatal(err)
		}
	}
}

// TestOtelCollectorGCMLabelAggregation validates that Google Cloud Monitoring
// metrics to ensure that the "commit" label is removed through aggregation in
// the otel-collector config.
Expand Down Expand Up @@ -429,3 +585,30 @@ func validateMetricInGCM(nt *nomostest.NT, it *monitoringv2.TimeSeriesIterator,
return fmt.Errorf("GCM metric %s not found (cluster_name=%s)",
metricType, nt.ClusterName)
}

// ConfigMap is a minimal YAML view of a ConfigMap manifest: it decodes only
// the `otel-collector-config.yaml` entry of the `data` section, as a raw
// string.
type ConfigMap struct {
Data struct {
// OtelCollectorConfig holds the embedded otel-collector configuration as
// unparsed YAML text.
OtelCollectorConfig string `yaml:"otel-collector-config.yaml"`
} `yaml:"data"`
}

// OtelConfig is a minimal YAML view of an otel-collector configuration: it
// decodes only the `service.pipelines` section.
type OtelConfig struct {
Service struct {
// Pipelines maps each pipeline name found under `service.pipelines` to its
// definition.
Pipelines map[string]Pipeline `yaml:"pipelines"`
} `yaml:"service"`
}

// Pipeline is the YAML shape of one entry under `service.pipelines`: the
// lists of names given for its `receivers`, `processors` and `exporters` keys.
type Pipeline struct {
Receivers []string `yaml:"receivers"`
Processors []string `yaml:"processors"`
Exporters []string `yaml:"exporters"`
}

// pipelinesInclude reports whether config declares a service pipeline whose
// name is exactly name (for example "metrics/prometheus"). A config without
// any pipelines yields false.
func pipelinesInclude(name string, config OtelConfig) bool {
	// Pipelines is keyed by pipeline name, so a direct lookup replaces the
	// scan over all keys; indexing a nil map is safe and reports not-found.
	_, found := config.Service.Pipelines[name]
	return found
}
25 changes: 25 additions & 0 deletions examples/otel-collector-sample-configurations/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Purpose of this directory

This folder provides sample [custom monitoring configurations](https://cloud.google.com/anthos-config-management/docs/how-to/monitor-config-sync-custom)
for Config Sync. These examples are intended for your convenience. While Config
Sync strives to keep them updated, always consult the [latest configuration](https://github.com/GoogleContainerTools/kpt-config-sync/blob/main/pkg/metrics/otel.go).

# Available Configurations

_otel-collector-monarch.yaml_: Serves as a template for exporting metrics
exclusively to Google Cloud Monarch and Prometheus.

_otel-collector-prometheus.yaml_: Serves as a template for exporting metrics
exclusively to Prometheus.

# Instructions

* **Apply ConfigMap**: Apply the desired ConfigMap to your cluster.
* **Restart otel-collector**: The otel-collector deployment should restart automatically. If it doesn't, execute:
```
kubectl rollout restart deployment otel-collector -n config-management-monitoring
```
# Removal

* **Delete ConfigMap**: Remove the ConfigMap from your cluster.
* **Restart otel-collector**: The otel-collector deployment should restart automatically. If it doesn't, use the same kubectl rollout restart command as above.
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: v1
kind: ConfigMap
metadata:
labels:
app: opentelemetry
component: otel-collector
configmanagement.gke.io/arch: csmr
configmanagement.gke.io/system: "true"
name: otel-collector-custom
namespace: config-management-monitoring
data:
otel-collector-config.yaml: |-
receivers:
opencensus:
exporters:
prometheus:
endpoint: :8675
namespace: config_sync
resource_to_telemetry_conversion:
enabled: true
googlecloud/kubernetes:
metric:
prefix: "kubernetes.io/internal/addons/config_sync/"
# skip_create_descriptor: Metrics starting with 'kubernetes.io/' already
# have descriptors defined internally. Skip sending duplicated metric
# descriptors here to prevent errors or conflicts.
skip_create_descriptor: true
# instrumentation_library_labels: Otel Collector by default attaches
# 'instrumentation_version' and 'instrumentation_source' labels that are
# not specified in our Cloud Monarch definitions, thus skipping them here
instrumentation_library_labels: false
# create_service_timeseries: This is the recommended configuration for
# 'service metrics' that start with the 'kubernetes.io/' prefix. It uses the
# CreateTimeSeries API and has its own quotas, so that custom metric writes
# will not break this ingestion pipeline
create_service_timeseries: true
service_resource_labels: false
retry_on_failure:
enabled: false
sending_queue:
enabled: false
processors:
batch:
# resourcedetection: This processor is needed to correctly mirror resource
# labels from OpenCensus to OpenTelemetry. We also want to keep this same
# processor in Otel Agent configuration as the resource labels are added from
# there
resourcedetection:
detectors: [env, gcp]
filter/kubernetes:
metrics:
include:
match_type: regexp
metric_names:
- kustomize.*
- api_duration_seconds
- reconciler_errors
- pipeline_error_observed
- reconcile_duration_seconds
- rg_reconcile_duration_seconds
- parser_duration_seconds
- declared_resources
- apply_operations_total
- apply_duration_seconds
- resource_fights_total
- remediate_duration_seconds
- resource_conflicts_total
- internal_errors_total
- rendering_count_total
- skip_rendering_count_total
- resource_override_count_total
- git_sync_depth_override_count_total
- no_ssl_verify_count_total
- kcc_resource_count
- last_sync_timestamp
# Remove custom configsync metric labels that are not registered with Monarch
# This action applies to all metrics that are sent through the pipeline that
# is using this processor
attributes/kubernetes:
actions:
- key: configsync.sync.kind
action: delete
- key: configsync.sync.name
action: delete
- key: configsync.sync.namespace
action: delete
- key: commit
action: delete
metricstransform/kubernetes:
transforms:
- include: declared_resources
action: update
new_name: current_declared_resources
- include: reconciler_errors
action: update
new_name: last_reconciler_errors
- include: pipeline_error_observed
action: update
new_name: last_pipeline_error_observed
- include: apply_operations_total
action: update
new_name: apply_operations_count
- include: resource_fights_total
action: update
new_name: resource_fights_count
- include: resource_conflicts_total
action: update
new_name: resource_conflicts_count
- include: internal_errors_total
action: update
new_name: internal_errors_count
- include: rendering_count_total
action: update
new_name: rendering_count
- include: skip_rendering_count_total
action: update
new_name: skip_rendering_count
- include: resource_override_count_total
action: update
new_name: resource_override_count
- include: git_sync_depth_override_count_total
action: update
new_name: git_sync_depth_override_count
- include: no_ssl_verify_count_total
action: update
new_name: no_ssl_verify_count
extensions:
health_check:
service:
extensions: [health_check]
pipelines:
metrics/prometheus:
receivers: [opencensus]
processors: [batch]
exporters: [prometheus]
metrics/kubernetes:
receivers: [opencensus]
processors: [batch, filter/kubernetes, attributes/kubernetes, metricstransform/kubernetes, resourcedetection]
exporters: [googlecloud/kubernetes]
Loading

0 comments on commit 7dff4c0

Please sign in to comment.