Skip to content

Commit

Permalink
Fix OOM issue and "http2: stream closed" issue by returning empty Lis…
Browse files Browse the repository at this point in the history
…tCustomMetrics
  • Loading branch information
CatherineF-dev committed Jan 10, 2024
1 parent 8d6ceab commit 6d08ee5
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 4 deletions.
2 changes: 1 addition & 1 deletion custom-metrics-stackdriver-adapter/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ GOOS?=linux
OUT_DIR?=build
PACKAGE=github.com/GoogleCloudPlatform/k8s-stackdriver/custom-metrics-stackdriver-adapter
PREFIX?=staging-k8s.gcr.io
TAG = v0.13.1
TAG = v0.14.0
PKG := $(shell find pkg/* -type f)

.PHONY: build docker push test clean
Expand Down
6 changes: 5 additions & 1 deletion custom-metrics-stackdriver-adapter/adapter.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ func (sa *StackdriverAdapter) makeProviderOrDie(o *stackdriverAdapterServerOptio
conf.GenericConfig.EnableMetrics = true

translator := translator.NewTranslator(stackdriverService, gceConf, rateInterval, alignmentPeriod, mapper, o.UseNewResourceModel, o.EnableDistributionSupport)
return adapter.NewStackdriverProvider(client, mapper, gceConf, stackdriverService, translator, rateInterval, o.UseNewResourceModel, o.FallbackForContainerMetrics), translator
return adapter.NewStackdriverProvider(client, mapper, gceConf, stackdriverService, translator, rateInterval, o.UseNewResourceModel, o.FallbackForContainerMetrics, o.SupportListCustomMetrics), translator
}

func (sa *StackdriverAdapter) withCoreMetrics(translator *translator.Translator) error {
Expand Down Expand Up @@ -154,6 +154,7 @@ func main() {
FallbackForContainerMetrics: false,
EnableCoreMetricsAPI: false,
EnableDistributionSupport: false,
SupportListCustomMetrics: false,
}

flags.BoolVar(&serverOptions.UseNewResourceModel, "use-new-resource-model", serverOptions.UseNewResourceModel,
Expand All @@ -166,6 +167,8 @@ func main() {
"If true, fallbacks to k8s_container resource when given metric is not present on k8s_pod. At most one container with given metric is allowed for each pod.")
flags.BoolVar(&serverOptions.EnableCoreMetricsAPI, "enable-core-metrics-api", serverOptions.EnableCoreMetricsAPI,
"Experimental, do not use. Whether to enable Core Metrics API.")
flags.BoolVar(&serverOptions.SupportListCustomMetrics, "support-list-custom-metrics", serverOptions.SupportListCustomMetrics,
"Whether to support listing custom metrics. This is a feature gate that re-enables listing custom metrics and should be kept false; otherwise it causes high memory usage and timeout errors in the logs.")
flags.StringVar(&serverOptions.MetricsAddress, "metrics-address", "",
"Endpoint with port on which Prometheus metrics server should be enabled. Example: localhost:8080. If there is no flag, Prometheus metric server is disabled and monitoring metrics are not collected.")
flags.StringVar(&serverOptions.StackdriverEndpoint, "stackdriver-endpoint", "",
Expand All @@ -175,6 +178,7 @@ func main() {

flags.Parse(os.Args)

klog.Info("serverOptions: ", serverOptions)
if !serverOptions.UseNewResourceModel && serverOptions.FallbackForContainerMetrics {
klog.Fatalf("Container metrics work only with new resource model")
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,11 @@ type StackdriverProvider struct {
metricsCacheSet bool
metricsCache []provider.CustomMetricInfo
fallbackForContainerMetrics bool
supportListCustomMetrics bool
}

// NewStackdriverProvider creates a StackdriverProvider
func NewStackdriverProvider(kubeClient *corev1.CoreV1Client, mapper apimeta.RESTMapper, gceConf *config.GceConfig, stackdriverService *stackdriver.Service, translator *translator.Translator, rateInterval time.Duration, useNewResourceModel bool, fallbackForContainerMetrics bool) provider.MetricsProvider {
func NewStackdriverProvider(kubeClient *corev1.CoreV1Client, mapper apimeta.RESTMapper, gceConf *config.GceConfig, stackdriverService *stackdriver.Service, translator *translator.Translator, rateInterval time.Duration, useNewResourceModel bool, fallbackForContainerMetrics bool, supportListCustomMetrics bool) provider.MetricsProvider {
return &StackdriverProvider{
kubeClient: kubeClient,
stackdriverService: stackdriverService,
Expand All @@ -71,6 +72,7 @@ func NewStackdriverProvider(kubeClient *corev1.CoreV1Client, mapper apimeta.REST
translator: translator,
useNewResourceModel: useNewResourceModel,
fallbackForContainerMetrics: fallbackForContainerMetrics,
supportListCustomMetrics: supportListCustomMetrics,
}
}

Expand Down Expand Up @@ -309,8 +311,13 @@ func (p *StackdriverProvider) getNamespacedMetricBySelector(groupResource schema
}

// ListAllMetrics returns all custom metrics available from Stackdriver.
// List only pod metrics
func (p *StackdriverProvider) ListAllMetrics() []provider.CustomMetricInfo {
// Returning an empty list here reduces memory usage significantly, and ListAllMetrics is not used by HPA.
if !p.supportListCustomMetrics {
return []provider.CustomMetricInfo{}
}

// List only pod metrics
p.mu.Lock()
defer p.mu.Unlock()
if !p.metricsCacheSet {
Expand Down

0 comments on commit 6d08ee5

Please sign in to comment.