Skip to content

Commit

Permalink
Fix OOM issue and "http2: stream closed" issue by returning empty Lis…
Browse files Browse the repository at this point in the history
…tCustomMetrics
  • Loading branch information
CatherineF-dev committed Jan 10, 2024
1 parent 8d6ceab commit 5a3bd71
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 4 deletions.
2 changes: 1 addition & 1 deletion custom-metrics-stackdriver-adapter/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ GOOS?=linux
OUT_DIR?=build
PACKAGE=github.com/GoogleCloudPlatform/k8s-stackdriver/custom-metrics-stackdriver-adapter
PREFIX?=staging-k8s.gcr.io
TAG = v0.13.1
TAG = v0.14.0
PKG := $(shell find pkg/* -type f)

.PHONY: build docker push test clean
Expand Down
11 changes: 10 additions & 1 deletion custom-metrics-stackdriver-adapter/adapter.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ type stackdriverAdapterServerOptions struct {
// EnableDistributionSupport is a flag that indicates whether distributions can
// be used (with special reducer labels) in the adapter.
EnableDistributionSupport bool
// SupportListCustomMetrics controls whether ListAllMetrics lists all pod custom metrics. Defaults to false; enabling it increases memory usage.
SupportListCustomMetrics bool
}

func (sa *StackdriverAdapter) makeProviderOrDie(o *stackdriverAdapterServerOptions, rateInterval time.Duration, alignmentPeriod time.Duration) (provider.MetricsProvider, *translator.Translator) {
Expand Down Expand Up @@ -110,7 +112,7 @@ func (sa *StackdriverAdapter) makeProviderOrDie(o *stackdriverAdapterServerOptio
conf.GenericConfig.EnableMetrics = true

translator := translator.NewTranslator(stackdriverService, gceConf, rateInterval, alignmentPeriod, mapper, o.UseNewResourceModel, o.EnableDistributionSupport)
return adapter.NewStackdriverProvider(client, mapper, gceConf, stackdriverService, translator, rateInterval, o.UseNewResourceModel, o.FallbackForContainerMetrics), translator
return adapter.NewStackdriverProvider(client, mapper, gceConf, stackdriverService, translator, rateInterval, o.UseNewResourceModel, o.FallbackForContainerMetrics, o.SupportListCustomMetrics), translator
}

func (sa *StackdriverAdapter) withCoreMetrics(translator *translator.Translator) error {
Expand Down Expand Up @@ -154,6 +156,7 @@ func main() {
FallbackForContainerMetrics: false,
EnableCoreMetricsAPI: false,
EnableDistributionSupport: false,
SupportListCustomMetrics: false,
}

flags.BoolVar(&serverOptions.UseNewResourceModel, "use-new-resource-model", serverOptions.UseNewResourceModel,
Expand All @@ -166,6 +169,8 @@ func main() {
"If true, fallbacks to k8s_container resource when given metric is not present on k8s_pod. At most one container with given metric is allowed for each pod.")
flags.BoolVar(&serverOptions.EnableCoreMetricsAPI, "enable-core-metrics-api", serverOptions.EnableCoreMetricsAPI,
"Experimental, do not use. Whether to enable Core Metrics API.")
flags.BoolVar(&serverOptions.SupportListCustomMetrics, "support-list-custom-metrics", serverOptions.SupportListCustomMetrics,
"whether to supporting list custom metrics. This is a featuragate to enable listing custom metrics back, which should keep as false. Otherwise, it would have high memory usage and timeout error log.")
flags.StringVar(&serverOptions.MetricsAddress, "metrics-address", "",
"Endpoint with port on which Prometheus metrics server should be enabled. Example: localhost:8080. If there is no flag, Prometheus metric server is disabled and monitoring metrics are not collected.")
flags.StringVar(&serverOptions.StackdriverEndpoint, "stackdriver-endpoint", "",
Expand All @@ -175,12 +180,16 @@ func main() {

flags.Parse(os.Args)

klog.Info("serverOptions: ", serverOptions)
if !serverOptions.UseNewResourceModel && serverOptions.FallbackForContainerMetrics {
klog.Fatalf("Container metrics work only with new resource model")
}
if !serverOptions.UseNewResourceModel && serverOptions.EnableCoreMetricsAPI {
klog.Fatalf("Core metrics work only with new resource model")
}
if serverOptions.SupportListCustomMetrics {
klog.Infof("SupportListCustomMetrics is enabled, which would increase memory usage a lot. Please keep it as false, unless it's needed.")
}

// TODO(holubwicz): move duration config to server options
metricsProvider, translator := cmd.makeProviderOrDie(&serverOptions, 5*time.Minute, 1*time.Minute)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,11 @@ type StackdriverProvider struct {
metricsCacheSet bool
metricsCache []provider.CustomMetricInfo
fallbackForContainerMetrics bool
supportListCustomMetrics bool
}

// NewStackdriverProvider creates a StackdriverProvider
func NewStackdriverProvider(kubeClient *corev1.CoreV1Client, mapper apimeta.RESTMapper, gceConf *config.GceConfig, stackdriverService *stackdriver.Service, translator *translator.Translator, rateInterval time.Duration, useNewResourceModel bool, fallbackForContainerMetrics bool) provider.MetricsProvider {
func NewStackdriverProvider(kubeClient *corev1.CoreV1Client, mapper apimeta.RESTMapper, gceConf *config.GceConfig, stackdriverService *stackdriver.Service, translator *translator.Translator, rateInterval time.Duration, useNewResourceModel bool, fallbackForContainerMetrics bool, supportListCustomMetrics bool) provider.MetricsProvider {
return &StackdriverProvider{
kubeClient: kubeClient,
stackdriverService: stackdriverService,
Expand All @@ -71,6 +72,7 @@ func NewStackdriverProvider(kubeClient *corev1.CoreV1Client, mapper apimeta.REST
translator: translator,
useNewResourceModel: useNewResourceModel,
fallbackForContainerMetrics: fallbackForContainerMetrics,
supportListCustomMetrics: supportListCustomMetrics,
}
}

Expand Down Expand Up @@ -309,8 +311,13 @@ func (p *StackdriverProvider) getNamespacedMetricBySelector(groupResource schema
}

// ListAllMetrics returns all custom metrics available from Stackdriver.
// List only pod metrics
func (p *StackdriverProvider) ListAllMetrics() []provider.CustomMetricInfo {
// Returning an empty list when listing is disabled reduces memory usage significantly; ListAllMetrics is not used by HPA.
if !p.supportListCustomMetrics {
return []provider.CustomMetricInfo{}
}

// List only pod metrics
p.mu.Lock()
defer p.mu.Unlock()
if !p.metricsCacheSet {
Expand Down

0 comments on commit 5a3bd71

Please sign in to comment.