-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ci neuron #1
Ci neuron #1
Changes from all commits
8920742
e0f5bcd
51fe859
cee2244
3debd28
3d9de49
9e70069
b9a0e03
52cd972
eeb90e2
1747a50
4f0e3e1
c95f590
609198b
1444acd
a89378e
d2c417d
00a12dc
d3bf111
622200a
9cb314e
a821803
83896ab
69969dd
3267653
164bd84
19223b1
c65ad64
1f60d15
c6966db
05b1c75
76e05aa
9e2f849
3168bb2
0c8eac2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ import ( | |
"fmt" | ||
"time" | ||
|
||
configutil "github.com/prometheus/common/config" | ||
"github.com/prometheus/common/model" | ||
"github.com/prometheus/prometheus/config" | ||
"github.com/prometheus/prometheus/discovery" | ||
|
@@ -25,7 +26,7 @@ import ( | |
) | ||
|
||
const ( | ||
caFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" | ||
caFile = "/etc/amazon-cloudwatch-observability-agent-cert/tls-ca.crt" | ||
Comment on lines
-28
to
+29
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess the changes in this file are coming from 9cb314e - can we merge it to |
||
collectionInterval = 60 * time.Second | ||
jobName = "containerInsightsDCGMExporterScraper" | ||
scraperMetricsPath = "/metrics" | ||
|
@@ -103,10 +104,16 @@ func NewDcgmScraper(opts DcgmScraperOpts) (*DcgmScraper, error) { | |
|
||
func getScraperConfig(hostInfoProvider hostInfoProvider) *config.ScrapeConfig { | ||
return &config.ScrapeConfig{ | ||
HTTPClientConfig: configutil.HTTPClientConfig{ | ||
TLSConfig: configutil.TLSConfig{ | ||
CAFile: caFile, | ||
InsecureSkipVerify: false, | ||
}, | ||
}, | ||
ScrapeInterval: model.Duration(collectionInterval), | ||
ScrapeTimeout: model.Duration(collectionInterval), | ||
JobName: jobName, | ||
Scheme: "http", | ||
Scheme: "https", | ||
MetricsPath: scraperMetricsPath, | ||
ServiceDiscoveryConfigs: discovery.Configs{ | ||
&kubernetes.SDConfig{ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
// Copyright The OpenTelemetry Authors | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
package gpu | ||
|
||
import ( | ||
"context" | ||
"time" | ||
|
||
"github.com/prometheus/common/model" | ||
"github.com/prometheus/prometheus/config" | ||
"github.com/prometheus/prometheus/discovery" | ||
"github.com/prometheus/prometheus/discovery/kubernetes" | ||
"github.com/prometheus/prometheus/model/relabel" | ||
"go.opentelemetry.io/collector/component" | ||
"go.opentelemetry.io/collector/consumer" | ||
"go.opentelemetry.io/collector/receiver" | ||
"go.uber.org/zap" | ||
|
||
ci "github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/containerinsight" | ||
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver/internal/prometheusscraper" | ||
) | ||
|
||
// Package-level settings for the DCGM exporter scrape job. | ||
const ( | ||
// caFile is a CA certificate path under the Kubernetes service-account secrets directory. | ||
// NOTE(review): not referenced by any code visible in this section - confirm it is still needed. | ||
caFile = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" | ||
// collectionInterval is used as both the scrape interval and the scrape timeout in GetScraperConfig. | ||
collectionInterval = 60 * time.Second | ||
// jobName is the JobName set on the scrape config. | ||
jobName = "containerInsightsDCGMExporterScraper" | ||
// scraperMetricsPath is the MetricsPath set on the scrape config. | ||
scraperMetricsPath = "/metrics" | ||
// scraperK8sServiceSelector is the label selector applied to the service-role Kubernetes discovery. | ||
scraperK8sServiceSelector = "k8s-app=dcgm-exporter-service" | ||
) | ||
|
||
// DcgmScraper bundles the context, telemetry settings, collector host, host info | ||
// provider, metrics receiver and Kubernetes decorator for the scraper, plus a | ||
// running flag. Its methods are not visible in this section. | ||
type DcgmScraper struct { | ||
ctx context.Context | ||
settings component.TelemetrySettings | ||
host component.Host | ||
hostInfoProvider hostInfoProvider | ||
// NOTE(review): presumably the Prometheus receiver built from GetScraperConfig - confirm in the constructor. | ||
prometheusReceiver receiver.Metrics | ||
k8sDecorator prometheusscraper.Decorator | ||
// NOTE(review): presumably tracks whether the receiver has been started - confirm against the start/stop code. | ||
running bool | ||
} | ||
|
||
// DcgmScraperOpts carries the dependencies supplied by the caller when building a | ||
// DcgmScraper: context, telemetry settings, downstream metrics consumer, collector | ||
// host, host info provider, Kubernetes decorator and logger. | ||
// NOTE(review): the constructor that consumes these options is not visible in this section. | ||
type DcgmScraperOpts struct { | ||
Ctx context.Context | ||
TelemetrySettings component.TelemetrySettings | ||
Consumer consumer.Metrics | ||
Host component.Host | ||
HostInfoProvider hostInfoProvider | ||
K8sDecorator prometheusscraper.Decorator | ||
Logger *zap.Logger | ||
} | ||
|
||
// hostInfoProvider supplies the static values that getMetricRelabelConfig injects | ||
// into every scraped metric as labels. | ||
type hostInfoProvider interface { | ||
// GetClusterName returns the value written to the ci.ClusterNameKey label. | ||
GetClusterName() string | ||
// GetInstanceID returns the value written to the ci.InstanceID label. | ||
GetInstanceID() string | ||
} | ||
|
||
// GetScraperConfig returns the Prometheus scrape configuration for the DCGM | ||
// exporter job: it scrapes scraperMetricsPath over plain HTTP every | ||
// collectionInterval (the same value is used as the timeout), discovers targets | ||
// through Kubernetes service discovery filtered by scraperK8sServiceSelector, and | ||
// applies the metric relabel rules from getMetricRelabelConfig. | ||
// The hostInfoProvider is only passed through to getMetricRelabelConfig. | ||
func GetScraperConfig(hostInfoProvider hostInfoProvider) *config.ScrapeConfig { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you're moving it here, shouldn't it be removed from internal/gpu/dcgmscraper.go? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes will delete it correct, again mad merge |
||
return &config.ScrapeConfig{ | ||
ScrapeInterval: model.Duration(collectionInterval), | ||
ScrapeTimeout: model.Duration(collectionInterval), | ||
JobName: jobName, | ||
// NOTE(review): plain HTTP here, while the other copy of this function in the same PR switches to HTTPS with a CA file - confirm which is intended. | ||
Scheme: "http", | ||
MetricsPath: scraperMetricsPath, | ||
ServiceDiscoveryConfigs: discovery.Configs{ | ||
&kubernetes.SDConfig{ | ||
Role: kubernetes.RoleService, | ||
// NOTE(review): presumably limits discovery to the namespace this process runs in - confirm against the Prometheus kubernetes_sd documentation. | ||
NamespaceDiscovery: kubernetes.NamespaceDiscovery{ | ||
IncludeOwnNamespace: true, | ||
}, | ||
Selectors: []kubernetes.SelectorConfig{ | ||
{ | ||
Role: kubernetes.RoleService, | ||
Label: scraperK8sServiceSelector, | ||
}, | ||
}, | ||
}, | ||
}, | ||
MetricRelabelConfigs: getMetricRelabelConfig(hostInfoProvider), | ||
} | ||
} | ||
|
||
// getMetricRelabelConfig returns the ordered metric relabel rules for the DCGM | ||
// scrape job. The first rule keeps only metrics whose name matches "DCGM_.*"; the | ||
// remaining rules copy exporter labels onto Container Insights attribute keys and | ||
// inject the cluster name and instance ID obtained from hostInfoProvider. | ||
// hostInfoProvider is called once per value when the rules are built. | ||
func getMetricRelabelConfig(hostInfoProvider hostInfoProvider) []*relabel.Config { | ||
return []*relabel.Config{ | ||
// keep only metrics whose name matches DCGM_.* | ||
{ | ||
SourceLabels: model.LabelNames{"__name__"}, | ||
Regex: relabel.MustNewRegexp("DCGM_.*"), | ||
Action: relabel.Keep, | ||
}, | ||
// copy the "Hostname" label value to the node name key | ||
{ | ||
SourceLabels: model.LabelNames{"Hostname"}, | ||
TargetLabel: ci.NodeNameKey, | ||
Regex: relabel.MustNewRegexp("(.*)"), | ||
Replacement: "${1}", | ||
Action: relabel.Replace, | ||
}, | ||
// copy the "namespace" label value to the k8s namespace attribute | ||
{ | ||
SourceLabels: model.LabelNames{"namespace"}, | ||
TargetLabel: ci.AttributeK8sNamespace, | ||
Regex: relabel.MustNewRegexp("(.*)"), | ||
Replacement: "${1}", | ||
Action: relabel.Replace, | ||
}, | ||
// hacky way to inject static values (clusterName & instanceId) to label set without additional processor | ||
// relabel looks up an existing label then creates another label with given key (TargetLabel) and value (static) | ||
{ | ||
SourceLabels: model.LabelNames{"namespace"}, | ||
TargetLabel: ci.ClusterNameKey, | ||
Regex: relabel.MustNewRegexp(".*"), | ||
Replacement: hostInfoProvider.GetClusterName(), | ||
Action: relabel.Replace, | ||
}, | ||
{ | ||
SourceLabels: model.LabelNames{"namespace"}, | ||
TargetLabel: ci.InstanceID, | ||
Regex: relabel.MustNewRegexp(".*"), | ||
Replacement: hostInfoProvider.GetInstanceID(), | ||
Action: relabel.Replace, | ||
}, | ||
// copy the "pod" label value to the full pod name attribute | ||
{ | ||
SourceLabels: model.LabelNames{"pod"}, | ||
TargetLabel: ci.AttributeFullPodName, | ||
Regex: relabel.MustNewRegexp("(.*)"), | ||
Replacement: "${1}", | ||
Action: relabel.Replace, | ||
}, | ||
// additional k8s podname for service name and k8s blob decoration | ||
{ | ||
SourceLabels: model.LabelNames{"pod"}, | ||
TargetLabel: ci.AttributeK8sPodName, | ||
Regex: relabel.MustNewRegexp("(.*)"), | ||
Replacement: "${1}", | ||
Action: relabel.Replace, | ||
}, | ||
// copy the "container" label value to the container name attribute | ||
{ | ||
SourceLabels: model.LabelNames{"container"}, | ||
TargetLabel: ci.AttributeContainerName, | ||
Regex: relabel.MustNewRegexp("(.*)"), | ||
Replacement: "${1}", | ||
Action: relabel.Replace, | ||
}, | ||
// copy the "device" label value to the GPU device attribute | ||
{ | ||
SourceLabels: model.LabelNames{"device"}, | ||
TargetLabel: ci.AttributeGpuDevice, | ||
Regex: relabel.MustNewRegexp("(.*)"), | ||
Replacement: "${1}", | ||
Action: relabel.Replace, | ||
}, | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why this change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Merge error will fix