Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ci neuron #1

Closed
wants to merge 35 commits into from
Closed
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
8920742
add dcgm exporter scraper and move prometheus scraper test mock to mo…
movence Feb 2, 2024
e0f5bcd
update emf exporter to handle GPU metrics with different metric types
movence Feb 13, 2024
51fe859
remove custom logic in emf exporter
movence Feb 14, 2024
cee2244
update gpu flag comment
movence Feb 14, 2024
3debd28
remove comments and test codes
movence Feb 14, 2024
3d9de49
add neuron monitor scraper
sam6134 Feb 15, 2024
9e70069
remove unused codes and rename scraper init funcs
movence Feb 16, 2024
b9a0e03
remove comments
movence Feb 16, 2024
52cd972
add changelog for gpu
movence Feb 16, 2024
eeb90e2
Merge branch 'ci-nvidia-gpu' into ci-neuron
sam6134 Feb 19, 2024
1747a50
Update Scraper for new metrics
sam6134 Feb 20, 2024
4f0e3e1
Make Neuron Scraper extension for simple prometheus scraper
sam6134 Feb 23, 2024
c95f590
Minor fixes
sam6134 Feb 23, 2024
609198b
EnableFlag default to false
sam6134 Feb 23, 2024
1444acd
add gpu metric consumer that uses k8s decorator for attributes
movence Feb 26, 2024
a89378e
Merge branch 'ci-nvidia-gpu' into ci-neuron
sam6134 Feb 27, 2024
d2c417d
testing support
sam6134 Mar 1, 2024
00a12dc
debugging pod resources store
aditya-purang Mar 1, 2024
d3bf111
Add dcgm scraper to collect nvidia GPU metrics (#160)
movence Mar 1, 2024
622200a
[internal/aws/proxy] Fix proxy server unit test (#177)
jefchien Mar 1, 2024
9cb314e
Adding default TLS to dcgmscraper (#178)
okankoAMZ Mar 1, 2024
a821803
add podresource scrapper and metric data printer
aditya-purang Mar 4, 2024
83896ab
refactor logMd
aditya-purang Mar 4, 2024
69969dd
Merge branch 'ci-nvidia-gpu' into ci-neuron
sam6134 Mar 4, 2024
3267653
Merge conflicts
sam6134 Mar 4, 2024
164bd84
More cleanups
sam6134 Mar 4, 2024
19223b1
Remove unused imports
sam6134 Mar 4, 2024
c65ad64
Add decorator to neuron scraper
sam6134 Mar 4, 2024
1f60d15
Merge branch 'ci-neuron' into docker-testing
sam6134 Mar 4, 2024
c6966db
Add decorator to add podResources
sam6134 Mar 5, 2024
05b1c75
Unified the decorator and added podResources decorator
sam6134 Mar 5, 2024
76e05aa
Minor fixes
sam6134 Mar 5, 2024
9e2f849
remove unused file
sam6134 Mar 5, 2024
3168bb2
Making Dcgm implement SimplePrometheusScraper
sam6134 Mar 6, 2024
0c8eac2
Merge branch 'ci-neuron' into ci-neuron
sam6134 Mar 6, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .chloggen-aws/nvidia-gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: 'enhancement'

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: containerinsightsreceiver

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: "Adds DCGM scraper to collect NVIDIA GPU metrics"

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [160]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: |
  Supports NVIDIA GPU metrics by adding a new prometheus data scraper in a k8s environment. The new scraper
  relabels the default DCGM labels into existing Container Insights labels.

# e.g. '[aws]'
# Include 'aws' if the change is done by cwa
# Default: '[user]'
change_logs: [aws]
4 changes: 3 additions & 1 deletion exporter/awsemfexporter/metric_translator.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,9 @@ func (mt metricTranslator) translateOTelToGroupedMetric(rm pmetric.ResourceMetri
}

if serviceName, ok := rm.Resource().Attributes().Get("service.name"); ok {
if strings.HasPrefix(serviceName.Str(), "containerInsightsKubeAPIServerScraper") {
if strings.HasPrefix(serviceName.Str(), "containerInsightsKubeAPIServerScraper") ||
strings.HasPrefix(serviceName.Str(), "containerInsightsDCGMExporterScraper") ||
strings.HasPrefix(serviceName.Str(), "containerInsightsNeuronMonitorScraper") {
// the prometheus metrics that come from the container insight receiver need to be clearly tagged as coming from container insights
metricReceiver = containerInsightsReceiver
}
Expand Down
36 changes: 34 additions & 2 deletions exporter/awsemfexporter/metric_translator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,10 @@ func TestTranslateOtToGroupedMetric(t *testing.T) {
// need to have 1 more metric than the default because the first is not going to be retained because it is a delta metric
containerInsightMetric := createTestResourceMetricsHelper(defaultNumberOfTestMetrics + 1)
containerInsightMetric.Resource().Attributes().PutStr(conventions.AttributeServiceName, "containerInsightsKubeAPIServerScraper")
gpuMetric := createTestResourceMetricsHelper(defaultNumberOfTestMetrics + 1)
gpuMetric.Resource().Attributes().PutStr(conventions.AttributeServiceName, "containerInsightsDCGMExporterScraper")
neuronMetric := createTestResourceMetricsHelper(defaultNumberOfTestMetrics + 1)
neuronMetric.Resource().Attributes().PutStr(conventions.AttributeServiceName, "containerInsightsNeuronMonitorScraper")

counterSumMetrics := map[string]*metricInfo{
"spanCounter": {
Expand Down Expand Up @@ -368,12 +372,40 @@ func TestTranslateOtToGroupedMetric(t *testing.T) {
"spanName": "testSpan",
},
map[string]string{
(oTellibDimensionKey): "cloudwatch-lib",
"spanName": "testSpan",
oTellibDimensionKey: "cloudwatch-lib",
"spanName": "testSpan",
},
"myServiceNS/containerInsightsKubeAPIServerScraper",
containerInsightsReceiver,
},
{
"dcgm receiver",
gpuMetric,
map[string]string{
"isItAnError": "false",
"spanName": "testSpan",
},
map[string]string{
oTellibDimensionKey: "cloudwatch-lib",
"spanName": "testSpan",
},
"myServiceNS/containerInsightsDCGMExporterScraper",
containerInsightsReceiver,
},
{
"neuron monitor receiver",
neuronMetric,
map[string]string{
"isItAnError": "false",
"spanName": "testSpan",
},
map[string]string{
oTellibDimensionKey: "cloudwatch-lib",
"spanName": "testSpan",
},
"myServiceNS/containerInsightsNeuronMonitorScraper",
containerInsightsReceiver,
},
}

for _, tc := range testCases {
Expand Down
7 changes: 7 additions & 0 deletions receiver/awscontainerinsightreceiver/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,11 @@ type Config struct {
// EnableControlPlaneMetrics enables additional metrics sourced from the Kubernetes API server /metrics prometheus endpoint
// The default value is false.
EnableControlPlaneMetrics bool `mapstructure:"enable_control_plane_metrics"`

// EnableGpuMetric toggles GPU monitoring where metrics are scraped from vendor specific sources
EnableGpuMetric bool `mapstructure:"gpu_metrics"`

// EnableNeuronMetric toggles Neuron monitoring where metrics are scraped from neuron monitor
// The default value is false.
EnableNeuronMetric bool `mapstructure:"neuron_metrics"`
sam6134 marked this conversation as resolved.
Show resolved Hide resolved
}
4 changes: 4 additions & 0 deletions receiver/awscontainerinsightreceiver/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ const (

// Don't enable EKS control plane metrics by default
defaultEnableControlPlaneMetrics = false

// Don't enable Neuron metrics by default
defaultEnableNeuronMetrics = false
)

// NewFactory creates a factory for AWS container insight receiver
Expand All @@ -64,6 +67,7 @@ func createDefaultConfig() component.Config {
ClusterName: defaultClusterName,
LeaderLockName: defaultLeaderLockName,
EnableControlPlaneMetrics: defaultEnableControlPlaneMetrics,
EnableNeuronMetric: defaultEnableNeuronMetrics,
}
}

Expand Down
219 changes: 219 additions & 0 deletions receiver/awscontainerinsightreceiver/internal/gpu/dcgmscraper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package gpu

import (
"context"
"errors"
"fmt"
"time"

"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/discovery"
"github.com/prometheus/prometheus/discovery/kubernetes"
"github.com/prometheus/prometheus/model/relabel"
"go.opentelemetry.io/collector/component"
"go.opentelemetry.io/collector/consumer"
"go.opentelemetry.io/collector/pdata/pmetric"
"go.opentelemetry.io/collector/receiver"
"go.uber.org/zap"

"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver"
)

// Settings shared by the DCGM exporter scrape job.
const (
	// jobName is the Prometheus job name assigned to the DCGM exporter scrape config.
	jobName = "containerInsightsDCGMExporterScraper"

	// collectionInterval is applied as both the scrape interval and the scrape timeout.
	collectionInterval = time.Minute

	// caFile is a CA certificate path under the Kubernetes service account
	// secrets directory. It is not referenced by the code visible in this file.
	caFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
)

// DcgmScraper wraps a Prometheus receiver that scrapes DCGM exporter metrics.
// The receiver is created by NewDcgmScraper, started by the first GetMetrics
// call, and stopped by Shutdown.
type DcgmScraper struct {
// ctx is the context handed to the receiver's Start and Shutdown calls.
ctx context.Context
// settings supplies the logger used to report start/shutdown failures.
settings component.TelemetrySettings
// host is passed to the receiver when it is started.
host component.Host
// hostInfoProvider is stored by the constructor; no method visible in this
// file reads it afterwards.
hostInfoProvider hostInfoProvider
// prometheusReceiver is the underlying receiver that performs the scraping.
prometheusReceiver receiver.Metrics
// running is true after a successful Start and reset to false by Shutdown.
// NOTE(review): it is read and written without synchronization — confirm
// that GetMetrics and Shutdown are never called concurrently.
running bool
}

// DcgmScraperOpts carries the dependencies required by NewDcgmScraper.
type DcgmScraperOpts struct {
// Ctx is used when creating the Prometheus receiver and is kept by the
// scraper for later Start and Shutdown calls.
Ctx context.Context
// TelemetrySettings is forwarded to the Prometheus receiver and provides the
// scraper's logger.
TelemetrySettings component.TelemetrySettings
// Consumer is the consumer the Prometheus receiver is created with. Must not be nil.
Consumer consumer.Metrics
// Host is passed to the Prometheus receiver when it is started. Must not be nil.
Host component.Host
// HostInfoProvider supplies the cluster name and instance ID that are
// injected as static labels. Must not be nil.
HostInfoProvider hostInfoProvider
}

// hostInfoProvider exposes the host-level identifiers that the DCGM scraper
// attaches to every scraped metric as static labels.
type hostInfoProvider interface {
	// GetInstanceID returns the value written to the InstanceId label.
	GetInstanceID() string
	// GetClusterName returns the value written to the ClusterName label.
	GetClusterName() string
}

// NewDcgmScraper builds a DcgmScraper around a Prometheus receiver that is
// configured to scrape the DCGM exporter and deliver the metrics to
// opts.Consumer.
//
// It returns an error when opts.Consumer, opts.Host or opts.HostInfoProvider
// is nil, or when the Prometheus receiver cannot be created. The receiver is
// only created here; it is not started.
func NewDcgmScraper(opts DcgmScraperOpts) (*DcgmScraper, error) {
	if opts.Consumer == nil {
		return nil, errors.New("consumer cannot be nil")
	}
	if opts.Host == nil {
		return nil, errors.New("host cannot be nil")
	}
	if opts.HostInfoProvider == nil {
		// The message names the option that is actually missing.
		return nil, errors.New("host info provider cannot be nil")
	}

	promConfig := prometheusreceiver.Config{
		PrometheusConfig: &config.Config{
			ScrapeConfigs: []*config.ScrapeConfig{newDcgmScrapeConfig(opts.HostInfoProvider)},
		},
	}

	params := receiver.CreateSettings{
		TelemetrySettings: opts.TelemetrySettings,
	}

	promFactory := prometheusreceiver.NewFactory()
	promReceiver, err := promFactory.CreateMetricsReceiver(opts.Ctx, params, &promConfig, opts.Consumer)
	if err != nil {
		return nil, fmt.Errorf("failed to create prometheus receiver: %w", err)
	}

	return &DcgmScraper{
		ctx:                opts.Ctx,
		settings:           opts.TelemetrySettings,
		host:               opts.Host,
		hostInfoProvider:   opts.HostInfoProvider,
		prometheusReceiver: promReceiver,
	}, nil
}

// newDcgmScrapeConfig returns the scrape job for the DCGM exporter: services
// selected by the label k8s-app=dcgm-exporter-service are discovered through
// Kubernetes service discovery and scraped over http on /metrics, with the
// target address rewritten to port 9400.
func newDcgmScrapeConfig(provider hostInfoProvider) *config.ScrapeConfig {
	return &config.ScrapeConfig{
		ScrapeInterval: model.Duration(collectionInterval),
		ScrapeTimeout:  model.Duration(collectionInterval),
		JobName:        jobName,
		Scheme:         "http",
		MetricsPath:    "/metrics",
		ServiceDiscoveryConfigs: discovery.Configs{
			&kubernetes.SDConfig{
				Role: kubernetes.RoleService,
				NamespaceDiscovery: kubernetes.NamespaceDiscovery{
					IncludeOwnNamespace: true,
				},
				Selectors: []kubernetes.SelectorConfig{
					{
						Role:  kubernetes.RoleService,
						Label: "k8s-app=dcgm-exporter-service",
					},
				},
				AttachMetadata: kubernetes.AttachMetadataConfig{
					Node: true,
				},
			},
		},
		RelabelConfigs: []*relabel.Config{
			{
				// Keep the host part of the discovered address and force port 9400.
				SourceLabels: model.LabelNames{"__address__"},
				Regex:        relabel.MustNewRegexp("([^:]+)(?::\\d+)?"),
				Replacement:  "${1}:9400",
				TargetLabel:  "__address__",
				Action:       relabel.Replace,
			},
		},
		MetricRelabelConfigs: dcgmMetricRelabelConfigs(provider),
	}
}

// dcgmMetricRelabelConfigs returns the ordered metric relabel rules that keep
// only DCGM_* series and rename the exporter's labels to the label names used
// by the rest of this scraper's output. Rule order is significant and matches
// the order in which the rules are applied.
func dcgmMetricRelabelConfigs(provider hostInfoProvider) []*relabel.Config {
	return []*relabel.Config{
		{
			SourceLabels: model.LabelNames{"__name__"},
			Regex:        relabel.MustNewRegexp("DCGM_.*"),
			Action:       relabel.Keep,
		},
		dcgmCopyLabel("Hostname", "NodeName"),
		dcgmCopyLabel("namespace", "Namespace"),
		// hacky way to inject static values (clusterName & instanceId) to label set without additional processor
		// relabel looks up an existing label then creates another label with given key (TargetLabel) and value (static)
		dcgmStaticLabel("ClusterName", provider.GetClusterName()),
		dcgmStaticLabel("InstanceId", provider.GetInstanceID()),
		dcgmCopyLabel("pod", "FullPodName"),
		{
			// Greedy match: PodName is the pod label with its last "-suffix" removed.
			SourceLabels: model.LabelNames{"pod"},
			TargetLabel:  "PodName",
			Regex:        relabel.MustNewRegexp("(.+)-(.+)"),
			Replacement:  "${1}",
			Action:       relabel.Replace,
		},
		// additional k8s podname for service name decoration
		dcgmCopyLabel("pod", "K8sPodName"),
		dcgmCopyLabel("container", "ContainerName"),
		dcgmCopyLabel("device", "GpuDevice"),
	}
}

// dcgmCopyLabel returns a relabel rule that copies the full value of the
// source label into the target label.
func dcgmCopyLabel(source, target string) *relabel.Config {
	return &relabel.Config{
		SourceLabels: model.LabelNames{model.LabelName(source)},
		TargetLabel:  target,
		Regex:        relabel.MustNewRegexp("(.*)"),
		Replacement:  "${1}",
		Action:       relabel.Replace,
	}
}

// dcgmStaticLabel returns a relabel rule that sets the target label to value.
// The "namespace" source label is only a hook for the rule; ".*" matches any
// source value.
// NOTE(review): value is used verbatim as the relabel replacement, so a "$"
// in it would presumably be treated as a capture reference — confirm that
// cluster names and instance IDs never contain one.
func dcgmStaticLabel(target, value string) *relabel.Config {
	return &relabel.Config{
		SourceLabels: model.LabelNames{"namespace"},
		TargetLabel:  target,
		Regex:        relabel.MustNewRegexp(".*"),
		Replacement:  value,
		Action:       relabel.Replace,
	}
}

// GetMetrics makes sure the underlying Prometheus receiver has been started.
// It always returns nil: scraped metrics are not returned to the caller here.
// A failed start is logged and attempted again on the next call.
func (ds *DcgmScraper) GetMetrics() []pmetric.Metrics {
	if ds.running {
		// Already started; nothing to do.
		return nil
	}

	ds.settings.Logger.Info("The scraper is not running, starting up the scraper")
	if err := ds.prometheusReceiver.Start(ds.ctx, ds.host); err != nil {
		ds.settings.Logger.Error("Unable to start PrometheusReceiver", zap.Error(err))
		ds.running = false
		return nil
	}

	ds.running = true
	return nil
}

// Shutdown stops the underlying Prometheus receiver if it was started.
// A shutdown error is logged, and the scraper is marked as not running either
// way. Calling Shutdown on a scraper that is not running is a no-op.
func (ds *DcgmScraper) Shutdown() {
	if !ds.running {
		return
	}

	if err := ds.prometheusReceiver.Shutdown(ds.ctx); err != nil {
		ds.settings.Logger.Error("Unable to shutdown PrometheusReceiver", zap.Error(err))
	}
	ds.running = false
}
Loading