Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ci neuron #1

Closed
wants to merge 35 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
8920742
add dcgm exporter scraper and move prometheus scraper test mock to mo…
movence Feb 2, 2024
e0f5bcd
update emf exporter to handle GPU metrics with different metric types
movence Feb 13, 2024
51fe859
remove custom logic in emf exporter
movence Feb 14, 2024
cee2244
update gpu flag comment
movence Feb 14, 2024
3debd28
remove comments and test codes
movence Feb 14, 2024
3d9de49
add neuron monitor scraper
sam6134 Feb 15, 2024
9e70069
remove unused codes and rename scraper init funcs
movence Feb 16, 2024
b9a0e03
remove comments
movence Feb 16, 2024
52cd972
add changelog for gpu
movence Feb 16, 2024
eeb90e2
Merge branch 'ci-nvidia-gpu' into ci-neuron
sam6134 Feb 19, 2024
1747a50
Update Scraper for new metrics
sam6134 Feb 20, 2024
4f0e3e1
Make Neuron Scraper extension for simple prometheus scraper
sam6134 Feb 23, 2024
c95f590
Minor fixes
sam6134 Feb 23, 2024
609198b
EnableFlag default to false
sam6134 Feb 23, 2024
1444acd
add gpu metric consumer that uses k8s decorator for attributes
movence Feb 26, 2024
a89378e
Merge branch 'ci-nvidia-gpu' into ci-neuron
sam6134 Feb 27, 2024
d2c417d
testing support
sam6134 Mar 1, 2024
00a12dc
debugging pod resources store
aditya-purang Mar 1, 2024
d3bf111
Add dcgm scraper to collect nvidia GPU metrics (#160)
movence Mar 1, 2024
622200a
[internal/aws/proxy] Fix proxy server unit test (#177)
jefchien Mar 1, 2024
9cb314e
Adding default TLS to dcgmscraper (#178)
okankoAMZ Mar 1, 2024
a821803
add podresource scrapper and metric data printer
aditya-purang Mar 4, 2024
83896ab
refactor logMd
aditya-purang Mar 4, 2024
69969dd
Merge branch 'ci-nvidia-gpu' into ci-neuron
sam6134 Mar 4, 2024
3267653
Merge conflicts
sam6134 Mar 4, 2024
164bd84
More cleanups
sam6134 Mar 4, 2024
19223b1
Remove unused imports
sam6134 Mar 4, 2024
c65ad64
Add decorator to neuron scraper
sam6134 Mar 4, 2024
1f60d15
Merge branch 'ci-neuron' into docker-testing
sam6134 Mar 4, 2024
c6966db
Add decorator to add podResources
sam6134 Mar 5, 2024
05b1c75
Unified the decorator and added podResources decorator
sam6134 Mar 5, 2024
76e05aa
Minor fixes
sam6134 Mar 5, 2024
9e2f849
remove unused file
sam6134 Mar 5, 2024
3168bb2
Making Dcgm implement SimplePrometheusScraper
sam6134 Mar 6, 2024
0c8eac2
Merge branch 'ci-neuron' into ci-neuron
sam6134 Mar 6, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion exporter/awsemfexporter/metric_translator.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,8 @@ func (mt metricTranslator) translateOTelToGroupedMetric(rm pmetric.ResourceMetri

if serviceName, ok := rm.Resource().Attributes().Get("service.name"); ok {
if strings.HasPrefix(serviceName.Str(), "containerInsightsKubeAPIServerScraper") ||
strings.HasPrefix(serviceName.Str(), "containerInsightsDCGMExporterScraper") {
strings.HasPrefix(serviceName.Str(), "containerInsightsDCGMExporterScraper") ||
strings.HasPrefix(serviceName.Str(), "containerInsightsNeuronMonitorScraper") {
// the prometheus metrics that come from the container insight receiver need to be clearly tagged as coming from container insights
metricReceiver = containerInsightsReceiver
}
Expand Down
16 changes: 16 additions & 0 deletions exporter/awsemfexporter/metric_translator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,8 @@ func TestTranslateOtToGroupedMetric(t *testing.T) {
containerInsightMetric.Resource().Attributes().PutStr(conventions.AttributeServiceName, "containerInsightsKubeAPIServerScraper")
gpuMetric := createTestResourceMetricsHelper(defaultNumberOfTestMetrics + 1)
gpuMetric.Resource().Attributes().PutStr(conventions.AttributeServiceName, "containerInsightsDCGMExporterScraper")
neuronMetric := createTestResourceMetricsHelper(defaultNumberOfTestMetrics + 1)
neuronMetric.Resource().Attributes().PutStr(conventions.AttributeServiceName, "containerInsightsNeuronMonitorScraper")

counterSumMetrics := map[string]*metricInfo{
"spanCounter": {
Expand Down Expand Up @@ -390,6 +392,20 @@ func TestTranslateOtToGroupedMetric(t *testing.T) {
"myServiceNS/containerInsightsDCGMExporterScraper",
containerInsightsReceiver,
},
{
"neuron monitor receiver",
neuronMetric,
map[string]string{
"isItAnError": "false",
"spanName": "testSpan",
},
map[string]string{
oTellibDimensionKey: "cloudwatch-lib",
"spanName": "testSpan",
},
"myServiceNS/containerInsightsNeuronMonitorScraper",
containerInsightsReceiver,
},
}

for _, tc := range testCases {
Expand Down
11 changes: 6 additions & 5 deletions internal/aws/containerinsight/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,12 @@ const (
TypeContainerDiskIO = "ContainerDiskIO"
// Special type for pause container
// because containerd does not set container name pause container name to POD like docker does.
TypeInfraContainer = "InfraContainer"
TypeGpuContainer = "ContainerGPU"
TypeGpuPod = "PodGPU"
TypeGpuNode = "NodeGPU"
TypeGpuCluster = "ClusterGPU"
TypeInfraContainer = "InfraContainer"
TypeGpuContainer = "ContainerGPU"
TypeGpuPod = "PodGPU"
TypeGpuNode = "NodeGPU"
TypeGpuCluster = "ClusterGPU"
TypeNeuronContainer = "ContainerNeuron"

// unit
UnitBytes = "Bytes"
Expand Down
2 changes: 1 addition & 1 deletion internal/aws/proxy/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ func TestCanCreateTransport(t *testing.T) {

_, err := NewServer(cfg, logger)
assert.Error(t, err, "NewServer should fail")
assert.Contains(t, err.Error(), "failed to parse proxy URL")
assert.Contains(t, err.Error(), "invalid control character in URL")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why this change?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Merge error; will fix.

}

func TestGetServiceEndpointInvalidAWSConfig(t *testing.T) {
Expand Down
5 changes: 5 additions & 0 deletions receiver/awscontainerinsightreceiver/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,9 @@ type Config struct {

// EnableAcceleratedComputeMetrics enabled features with accelerated compute resources where metrics are scraped from vendor specific sources
EnableAcceleratedComputeMetrics bool `mapstructure:"accelerated_compute_metrics"`
EnableGpuMetric bool `mapstructure:"accelerated_compute_metrics"`
sam6134 marked this conversation as resolved.
Show resolved Hide resolved

// EnableNeuronMetric toggles Neuron monitoring where metrics are scraped from neuron monitor
// The default value is false.
EnableNeuronMetric bool `mapstructure:"neuron_metrics"`
sam6134 marked this conversation as resolved.
Show resolved Hide resolved
}
4 changes: 4 additions & 0 deletions receiver/awscontainerinsightreceiver/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ const (

// Don't enable EKS control plane metrics by default
defaultEnableControlPlaneMetrics = false

// Don't enable Neuron metrics by default
defaultEnableNeuronMetrics = false
)

// NewFactory creates a factory for AWS container insight receiver
Expand All @@ -64,6 +67,7 @@ func createDefaultConfig() component.Config {
ClusterName: defaultClusterName,
LeaderLockName: defaultLeaderLockName,
EnableControlPlaneMetrics: defaultEnableControlPlaneMetrics,
EnableNeuronMetric: defaultEnableNeuronMetrics,
}
}

Expand Down
3 changes: 2 additions & 1 deletion receiver/awscontainerinsightreceiver/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@ require (
go.opentelemetry.io/collector/receiver v0.89.0
go.uber.org/zap v1.26.0
golang.org/x/exp v0.0.0-20231127185646-65229373498e
google.golang.org/grpc v1.59.0
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.28.3
k8s.io/apimachinery v0.28.3
k8s.io/client-go v0.28.3
k8s.io/klog v1.0.0
k8s.io/kubelet v0.27.3
k8s.io/utils v0.0.0-20230711102312-30195339c3c7
)

Expand Down Expand Up @@ -210,7 +212,6 @@ require (
google.golang.org/genproto v0.0.0-20231030173426-d783a09b4405 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20231106174013-bbf56f31fb17 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20231030173426-d783a09b4405 // indirect
google.golang.org/grpc v1.59.0 // indirect
google.golang.org/protobuf v1.31.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions receiver/awscontainerinsightreceiver/go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 9 additions & 2 deletions receiver/awscontainerinsightreceiver/internal/gpu/dcgmscraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"fmt"
"time"

configutil "github.com/prometheus/common/config"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/discovery"
Expand All @@ -25,7 +26,7 @@ import (
)

const (
caFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
caFile = "/etc/amazon-cloudwatch-observability-agent-cert/tls-ca.crt"
Comment on lines -28 to +29
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess the changes in this file are coming from 9cb314e - can we merge it to ci-nvidia-gpu first so they don't show up as diffs in your PR?

collectionInterval = 60 * time.Second
jobName = "containerInsightsDCGMExporterScraper"
scraperMetricsPath = "/metrics"
Expand Down Expand Up @@ -103,10 +104,16 @@ func NewDcgmScraper(opts DcgmScraperOpts) (*DcgmScraper, error) {

func getScraperConfig(hostInfoProvider hostInfoProvider) *config.ScrapeConfig {
return &config.ScrapeConfig{
HTTPClientConfig: configutil.HTTPClientConfig{
TLSConfig: configutil.TLSConfig{
CAFile: caFile,
InsecureSkipVerify: false,
},
},
ScrapeInterval: model.Duration(collectionInterval),
ScrapeTimeout: model.Duration(collectionInterval),
JobName: jobName,
Scheme: "http",
Scheme: "https",
MetricsPath: scraperMetricsPath,
ServiceDiscoveryConfigs: discovery.Configs{
&kubernetes.SDConfig{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package gpu

import (
"context"
"time"

"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/discovery"
"github.com/prometheus/prometheus/discovery/kubernetes"
"github.com/prometheus/prometheus/model/relabel"
"go.opentelemetry.io/collector/component"
"go.opentelemetry.io/collector/consumer"
"go.opentelemetry.io/collector/receiver"
"go.uber.org/zap"

ci "github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/containerinsight"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver/internal/prometheusscraper"
)

// Scrape settings for the DCGM exporter Prometheus job.
const (
// caFile is the path of the in-cluster service account CA certificate.
// NOTE(review): it is not referenced by the scrape config visible in this file — confirm it is still needed.
caFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
// collectionInterval is used as both the scrape interval and the scrape timeout.
collectionInterval = 60 * time.Second
// jobName is the Prometheus job name set on the scrape config.
jobName = "containerInsightsDCGMExporterScraper"
// scraperMetricsPath is the HTTP path scraped on each discovered target.
scraperMetricsPath = "/metrics"
// scraperK8sServiceSelector is the label selector applied to Kubernetes service discovery.
scraperK8sServiceSelector = "k8s-app=dcgm-exporter-service"
)

// DcgmScraper holds the state of the DCGM exporter scraper.
// Its methods are not visible in this view, so field roles below are inferred
// from their names and types and should be confirmed against the implementation.
type DcgmScraper struct {
ctx context.Context
settings component.TelemetrySettings
host component.Host
// hostInfoProvider supplies the cluster name and instance ID (see the interface definition).
hostInfoProvider hostInfoProvider
// prometheusReceiver is presumably the embedded Prometheus receiver that performs the scrape — TODO confirm.
prometheusReceiver receiver.Metrics
// k8sDecorator is presumably applied to scraped metrics before they reach the consumer — TODO confirm.
k8sDecorator prometheusscraper.Decorator
// running presumably records whether the receiver has been started — TODO confirm.
running bool
}

// DcgmScraperOpts bundles the dependencies passed to the DcgmScraper constructor.
// NOTE(review): which fields are mandatory is decided by the constructor, which is not visible here.
type DcgmScraperOpts struct {
Ctx context.Context
TelemetrySettings component.TelemetrySettings
// Consumer is the metrics consumer supplied by the caller.
Consumer consumer.Metrics
Host component.Host
// HostInfoProvider supplies the cluster name and instance ID injected as labels.
HostInfoProvider hostInfoProvider
K8sDecorator prometheusscraper.Decorator
Logger *zap.Logger
}

// hostInfoProvider exposes the host identity values that getMetricRelabelConfig
// injects as static labels on every kept metric.
type hostInfoProvider interface {
// GetClusterName returns the value written to the cluster name label.
GetClusterName() string
// GetInstanceID returns the value written to the instance ID label.
GetInstanceID() string
}

// GetScraperConfig returns the Prometheus scrape configuration for the DCGM exporter job.
//
// The config scrapes scraperMetricsPath over plain HTTP, uses collectionInterval as both
// interval and timeout, discovers targets through Kubernetes service discovery limited to
// the collector's own namespace and to services matching scraperK8sServiceSelector, and
// applies the relabel rules from getMetricRelabelConfig built with hostInfoProvider.
func GetScraperConfig(hostInfoProvider hostInfoProvider) *config.ScrapeConfig {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you're moving it here, shouldn't it be removed from internal/gpu/dcgmscraper.go?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes will delete it correct, again mad merge

return &config.ScrapeConfig{
ScrapeInterval: model.Duration(collectionInterval),
ScrapeTimeout: model.Duration(collectionInterval),
JobName: jobName,
Scheme: "http",
MetricsPath: scraperMetricsPath,
ServiceDiscoveryConfigs: discovery.Configs{
&kubernetes.SDConfig{
Role: kubernetes.RoleService,
NamespaceDiscovery: kubernetes.NamespaceDiscovery{
IncludeOwnNamespace: true,
},
Selectors: []kubernetes.SelectorConfig{
{
Role: kubernetes.RoleService,
Label: scraperK8sServiceSelector,
},
},
},
},
MetricRelabelConfigs: getMetricRelabelConfig(hostInfoProvider),
}
}

// getMetricRelabelConfig builds the ordered metric relabel rules for the DCGM scrape job.
//
// The rules, in order:
//  1. keep only series whose name matches DCGM_.*;
//  2. copy the exporter labels Hostname and namespace onto container insights keys;
//  3. stamp the cluster name and instance ID obtained from hostInfoProvider;
//  4. copy pod, container and device onto container insights keys.
func getMetricRelabelConfig(hostInfoProvider hostInfoProvider) []*relabel.Config {
	// mirror copies the entire value of label from onto label to.
	mirror := func(from model.LabelName, to string) *relabel.Config {
		return &relabel.Config{
			SourceLabels: model.LabelNames{from},
			TargetLabel:  to,
			Regex:        relabel.MustNewRegexp("(.*)"),
			Replacement:  "${1}",
			Action:       relabel.Replace,
		}
	}

	// stamp writes a fixed value into label to. Relabeling has no "set constant"
	// action, so the rule reads the namespace label, matches anything, and
	// replaces the match with the static value.
	stamp := func(to string, value string) *relabel.Config {
		return &relabel.Config{
			SourceLabels: model.LabelNames{"namespace"},
			TargetLabel:  to,
			Regex:        relabel.MustNewRegexp(".*"),
			Replacement:  value,
			Action:       relabel.Replace,
		}
	}

	// Drop everything that is not a DCGM metric before any label rewriting.
	keepDcgm := &relabel.Config{
		SourceLabels: model.LabelNames{"__name__"},
		Regex:        relabel.MustNewRegexp("DCGM_.*"),
		Action:       relabel.Keep,
	}

	return []*relabel.Config{
		keepDcgm,
		mirror("Hostname", ci.NodeNameKey),
		mirror("namespace", ci.AttributeK8sNamespace),
		stamp(ci.ClusterNameKey, hostInfoProvider.GetClusterName()),
		stamp(ci.InstanceID, hostInfoProvider.GetInstanceID()),
		mirror("pod", ci.AttributeFullPodName),
		// second pod-name label, used for service name and k8s blob decoration
		mirror("pod", ci.AttributeK8sPodName),
		mirror("container", ci.AttributeContainerName),
		mirror("device", ci.AttributeGpuDevice),
	}
}
Loading
Loading