From 55df82b9fa5011efae8fa7fb2de458222a159598 Mon Sep 17 00:00:00 2001 From: Catherine Fang Date: Wed, 10 Jan 2024 09:36:54 -0500 Subject: [PATCH] Fix OOM issue and "http2: stream closed" issue by only returning one item for ListCustomMetrics --- custom-metrics-stackdriver-adapter/Makefile | 2 +- custom-metrics-stackdriver-adapter/adapter.go | 24 ++++++++++++++++++- .../pkg/adapter/provider/provider.go | 23 ++++-------------- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/custom-metrics-stackdriver-adapter/Makefile b/custom-metrics-stackdriver-adapter/Makefile index f8318ab66..8a0373d2b 100644 --- a/custom-metrics-stackdriver-adapter/Makefile +++ b/custom-metrics-stackdriver-adapter/Makefile @@ -3,7 +3,7 @@ GOOS?=linux OUT_DIR?=build PACKAGE=github.com/GoogleCloudPlatform/k8s-stackdriver/custom-metrics-stackdriver-adapter PREFIX?=staging-k8s.gcr.io -TAG = v0.13.1 +TAG = v0.14.0 PKG := $(shell find pkg/* -type f) .PHONY: build docker push test clean diff --git a/custom-metrics-stackdriver-adapter/adapter.go b/custom-metrics-stackdriver-adapter/adapter.go index e72f70db6..f68f9d75f 100644 --- a/custom-metrics-stackdriver-adapter/adapter.go +++ b/custom-metrics-stackdriver-adapter/adapter.go @@ -65,6 +65,9 @@ type stackdriverAdapterServerOptions struct { // EnableDistributionSupport is a flag that indicates whether or not to allow distributions can // be used (with special reducer labels) in the adapter EnableDistributionSupport bool + // ListFullCustomMetrics is a flag that whether list all pod custom metrics during api discovery. + // Default = false, which only list 1 metric. Enabling this back would increase memory usage. + ListFullCustomMetrics bool } func (sa *StackdriverAdapter) makeProviderOrDie(o *stackdriverAdapterServerOptions, rateInterval time.Duration, alignmentPeriod time.Duration) (provider.MetricsProvider, *translator.Translator) { @@ -110,7 +113,19 @@ func (sa *StackdriverAdapter) makeProviderOrDie(o *stackdriverAdapterServerOptio conf.GenericConfig.EnableMetrics = true translator := translator.NewTranslator(stackdriverService, gceConf, rateInterval, alignmentPeriod, mapper, o.UseNewResourceModel, o.EnableDistributionSupport) - return adapter.NewStackdriverProvider(client, mapper, gceConf, stackdriverService, translator, rateInterval, o.UseNewResourceModel, o.FallbackForContainerMetrics), translator + + // If ListFullCustomMetrics is false, it returns one resource during api discovery `kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta2"` to reduce memory usage. + stackdriverRequest := translator.ListMetricDescriptors(o.FallbackForContainerMetrics) + response, err := stackdriverRequest.Do() + if err != nil { + klog.Fatalf("Failed request to stackdriver api: %s", err) + } + customMetricsListCache := translator.GetMetricsFromSDDescriptorsResp(response) + if !o.ListFullCustomMetrics && len(customMetricsListCache) > 0 { + customMetricsListCache = customMetricsListCache[0:1] + } + + return adapter.NewStackdriverProvider(client, mapper, gceConf, stackdriverService, translator, rateInterval, o.UseNewResourceModel, o.FallbackForContainerMetrics, customMetricsListCache), translator } func (sa *StackdriverAdapter) withCoreMetrics(translator *translator.Translator) error { @@ -154,6 +169,7 @@ func main() { FallbackForContainerMetrics: false, EnableCoreMetricsAPI: false, EnableDistributionSupport: false, + ListFullCustomMetrics: false, } flags.BoolVar(&serverOptions.UseNewResourceModel, "use-new-resource-model", serverOptions.UseNewResourceModel, @@ -166,6 +182,8 @@ func main() { "If true, fallbacks to k8s_container resource when given metric is not present on k8s_pod. At most one container with given metric is allowed for each pod.") flags.BoolVar(&serverOptions.EnableCoreMetricsAPI, "enable-core-metrics-api", serverOptions.EnableCoreMetricsAPI, "Experimental, do not use. Whether to enable Core Metrics API.") + flags.BoolVar(&serverOptions.ListFullCustomMetrics, "list-full-custom-metrics", serverOptions.ListFullCustomMetrics, + "whether to supporting list full custom metrics. This is a featuragate to list full custom metrics back, which should keep as false to return only 1 metric. Otherwise, it would have high memory usage issue.") flags.StringVar(&serverOptions.MetricsAddress, "metrics-address", "", "Endpoint with port on which Prometheus metrics server should be enabled. Example: localhost:8080. If there is no flag, Prometheus metric server is disabled and monitoring metrics are not collected.") flags.StringVar(&serverOptions.StackdriverEndpoint, "stackdriver-endpoint", "", @@ -175,12 +193,16 @@ func main() { flags.Parse(os.Args) + klog.Info("serverOptions: ", serverOptions) if !serverOptions.UseNewResourceModel && serverOptions.FallbackForContainerMetrics { klog.Fatalf("Container metrics work only with new resource model") } if !serverOptions.UseNewResourceModel && serverOptions.EnableCoreMetricsAPI { klog.Fatalf("Core metrics work only with new resource model") } + if serverOptions.ListFullCustomMetrics { + klog.Infof("ListFullCustomMetrics is enabled, which would increase memory usage a lot. Please keep it as false, unless have to.") + } // TODO(holubwicz): move duration config to server options metricsProvider, translator := cmd.makeProviderOrDie(&serverOptions, 5*time.Minute, 1*time.Minute) diff --git a/custom-metrics-stackdriver-adapter/pkg/adapter/provider/provider.go b/custom-metrics-stackdriver-adapter/pkg/adapter/provider/provider.go index c3fcf52e3..f573406db 100644 --- a/custom-metrics-stackdriver-adapter/pkg/adapter/provider/provider.go +++ b/custom-metrics-stackdriver-adapter/pkg/adapter/provider/provider.go @@ -20,7 +20,6 @@ import ( "context" "fmt" "strings" - "sync" "time" "github.com/GoogleCloudPlatform/k8s-stackdriver/custom-metrics-stackdriver-adapter/pkg/adapter/translator" @@ -55,15 +54,13 @@ type StackdriverProvider struct { rateInterval time.Duration translator *translator.Translator useNewResourceModel bool - mu sync.Mutex - metricsCacheSet bool metricsCache []provider.CustomMetricInfo fallbackForContainerMetrics bool } // NewStackdriverProvider creates a StackdriverProvider -func NewStackdriverProvider(kubeClient *corev1.CoreV1Client, mapper apimeta.RESTMapper, gceConf *config.GceConfig, stackdriverService *stackdriver.Service, translator *translator.Translator, rateInterval time.Duration, useNewResourceModel bool, fallbackForContainerMetrics bool) provider.MetricsProvider { - return &StackdriverProvider{ +func NewStackdriverProvider(kubeClient *corev1.CoreV1Client, mapper apimeta.RESTMapper, gceConf *config.GceConfig, stackdriverService *stackdriver.Service, translator *translator.Translator, rateInterval time.Duration, useNewResourceModel bool, fallbackForContainerMetrics bool, customMetricsListCache []provider.CustomMetricInfo) provider.MetricsProvider { + p := &StackdriverProvider{ kubeClient: kubeClient, stackdriverService: stackdriverService, config: gceConf, @@ -71,7 +68,10 @@ func NewStackdriverProvider(kubeClient *corev1.CoreV1Client, mapper apimeta.REST translator: translator, useNewResourceModel: useNewResourceModel, fallbackForContainerMetrics: fallbackForContainerMetrics, + metricsCache: customMetricsListCache, } + + return p } // GetMetricByName fetches a particular metric for a particular object. @@ -309,20 +309,7 @@ func (p *StackdriverProvider) getNamespacedMetricBySelector(groupResource schema } // ListAllMetrics returns all custom metrics available from Stackdriver. -// List only pod metrics func (p *StackdriverProvider) ListAllMetrics() []provider.CustomMetricInfo { - p.mu.Lock() - defer p.mu.Unlock() - if !p.metricsCacheSet { - stackdriverRequest := p.translator.ListMetricDescriptors(p.fallbackForContainerMetrics) - response, err := stackdriverRequest.Do() - if err != nil { - klog.Errorf("Failed request to stackdriver api: %s", err) - return []provider.CustomMetricInfo{} - } - p.metricsCacheSet = true - p.metricsCache = p.translator.GetMetricsFromSDDescriptorsResp(response) - } return p.metricsCache }