
Add Neuron Scraper for scraping neuron monitor metrics #184

Merged Mar 21, 2024

Commits (62 total; the diff below shows changes from 59 commits)
8920742
add dcgm exporter scraper and move prometheus scraper test mock to mo…
movence Feb 2, 2024
e0f5bcd
update emf exporter to handle GPU metrics with different metric types
movence Feb 13, 2024
51fe859
remove custom logic in emf exporter
movence Feb 14, 2024
cee2244
update gpu flag comment
movence Feb 14, 2024
3debd28
remove comments and test codes
movence Feb 14, 2024
3d9de49
add neuron monitor scraper
sam6134 Feb 15, 2024
9e70069
remove unused codes and rename scraper init funcs
movence Feb 16, 2024
b9a0e03
remove comments
movence Feb 16, 2024
52cd972
add changelog for gpu
movence Feb 16, 2024
eeb90e2
Merge branch 'ci-nvidia-gpu' into ci-neuron
sam6134 Feb 19, 2024
1747a50
Update Scraper for new metrics
sam6134 Feb 20, 2024
4f0e3e1
Make Neuron Scraper extension for simple prometheus scraper
sam6134 Feb 23, 2024
c95f590
Minor fixes
sam6134 Feb 23, 2024
609198b
EnableFlag default to false
sam6134 Feb 23, 2024
1444acd
add gpu metric consumer that uses k8s decorator for attributes
movence Feb 26, 2024
a89378e
Merge branch 'ci-nvidia-gpu' into ci-neuron
sam6134 Feb 27, 2024
d2c417d
testing support
sam6134 Mar 1, 2024
00a12dc
debugging pod resources store
aditya-purang Mar 1, 2024
0d98f61
add dcgm exporter scraper and move prometheus scraper test mock to mo…
movence Feb 2, 2024
66a683b
update emf exporter to handle GPU metrics with different metric types
movence Feb 13, 2024
edc2cee
remove custom logic in emf exporter
movence Feb 14, 2024
a6443f1
update gpu flag comment
movence Feb 14, 2024
782f24a
remove comments and test codes
movence Feb 14, 2024
18db72a
remove unused codes and rename scraper init funcs
movence Feb 16, 2024
5d11f3c
remove comments
movence Feb 16, 2024
9ed6770
add changelog for gpu
movence Feb 16, 2024
e8af343
add gpu metric consumer that uses k8s decorator for attributes
movence Feb 26, 2024
0b7c895
address comments
movence Feb 27, 2024
bc8d269
handle the case with no dp
movence Feb 27, 2024
43ba471
consolidate CI metrics structs into single RawContainerInsights
movence Feb 29, 2024
5a4f394
update feature toggle flag name
movence Feb 29, 2024
3ee43f6
rename k8s attributes vars with Attribute prefix
movence Mar 1, 2024
6a2de95
merge conflicts
movence Mar 1, 2024
cc3bf6e
remove unused variable and code for setting metric type to it
movence Mar 1, 2024
852fff7
clean ups
movence Mar 1, 2024
a821803
add podresource scrapper and metric data printer
aditya-purang Mar 4, 2024
83896ab
refactor logMd
aditya-purang Mar 4, 2024
69969dd
Merge branch 'ci-nvidia-gpu' into ci-neuron
sam6134 Mar 4, 2024
3267653
Merge conflicts
sam6134 Mar 4, 2024
164bd84
More cleanups
sam6134 Mar 4, 2024
19223b1
Remove unused imports
sam6134 Mar 4, 2024
c65ad64
Add decorator to neuron scraper
sam6134 Mar 4, 2024
1f60d15
Merge branch 'ci-neuron' into docker-testing
sam6134 Mar 4, 2024
c6966db
Add decorator to add podResources
sam6134 Mar 5, 2024
05b1c75
Unified the decorator and added podResources decorator
sam6134 Mar 5, 2024
76e05aa
Minor fixes
sam6134 Mar 5, 2024
9e2f849
remove unused file
sam6134 Mar 5, 2024
3168bb2
Making Dcgm implement SimplePrometheusScraper
sam6134 Mar 6, 2024
0c8eac2
Merge branch 'ci-neuron' into ci-neuron
sam6134 Mar 6, 2024
72f324e
Merge pull request #1 from aditya-purang/ci-neuron
sam6134 Mar 6, 2024
b7d198a
Fix comments and merge conflicts
sam6134 Mar 7, 2024
9d35b4b
Merge branch 'aws-cwa-dev' into ci-neuron
sam6134 Mar 7, 2024
a20e010
more fixes
sam6134 Mar 7, 2024
a1d508a
Merge branch 'aws-cwa-dev' into ci-neuron
sam6134 Mar 7, 2024
def8c87
Fix linting, comments and links
sam6134 Mar 8, 2024
8aec9f5
Add more fix regarding license
sam6134 Mar 8, 2024
65771bb
Add additional labels for agent processor
sam6134 Mar 8, 2024
9f00e3d
Fix lint checks
sam6134 Mar 12, 2024
31b7d3a
fix go imports
sam6134 Mar 12, 2024
52ad0d0
Add HTTPS support for Neuron
sam6134 Mar 13, 2024
47c5f8c
Move Scrapers under one Flag and add more validations
sam6134 Mar 19, 2024
e67186d
Merge branch 'aws-cwa-dev' into ci-neuron
sam6134 Mar 19, 2024

Files changed
2 changes: 1 addition & 1 deletion exporter/awsemfexporter/internal/appsignals/useragent.go
@@ -1,7 +1,7 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package appsignals
package appsignals // import "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter/internal/appsignals"

import (
"context"
3 changes: 2 additions & 1 deletion exporter/awsemfexporter/metric_translator.go
@@ -132,7 +132,8 @@ func (mt metricTranslator) translateOTelToGroupedMetric(rm pmetric.ResourceMetri

if serviceName, ok := rm.Resource().Attributes().Get("service.name"); ok {
if strings.HasPrefix(serviceName.Str(), "containerInsightsKubeAPIServerScraper") ||
strings.HasPrefix(serviceName.Str(), "containerInsightsDCGMExporterScraper") {
strings.HasPrefix(serviceName.Str(), "containerInsightsDCGMExporterScraper") ||
strings.HasPrefix(serviceName.Str(), "containerInsightsNeuronMonitorScraper") {
// the prometheus metrics that come from the container insight receiver need to be clearly tagged as coming from container insights
metricReceiver = containerInsightsReceiver
}
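
For context on why this check works: the prometheus receiver backing these scrapers reports the scrape job name as the service.name resource attribute, so giving the Neuron monitor job a name that starts with containerInsightsNeuronMonitorScraper is what routes its metrics into the containerInsightsReceiver branch above. Below is a minimal sketch of such a scrape config; the job name is inferred from the prefix being matched, and the interval, timeout, and metrics path are assumptions rather than values taken from this PR.

import (
	"time"

	"github.com/prometheus/common/model"
	promconfig "github.com/prometheus/prometheus/config"
)

// Sketch only: the scrape job name surfaces as the "service.name" resource
// attribute, which the prefix check in metric_translator.go matches against.
func neuronScrapeJobSketch() *promconfig.ScrapeConfig {
	return &promconfig.ScrapeConfig{
		JobName:        "containerInsightsNeuronMonitorScraper", // assumed job name
		ScrapeInterval: model.Duration(60 * time.Second),        // assumed interval
		ScrapeTimeout:  model.Duration(10 * time.Second),        // assumed timeout
		MetricsPath:    "/metrics",                              // assumed neuron-monitor exporter path
	}
}
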
16 changes: 16 additions & 0 deletions exporter/awsemfexporter/metric_translator_test.go
@@ -282,6 +282,8 @@ func TestTranslateOtToGroupedMetric(t *testing.T) {
containerInsightMetric.Resource().Attributes().PutStr(conventions.AttributeServiceName, "containerInsightsKubeAPIServerScraper")
gpuMetric := createTestResourceMetricsHelper(defaultNumberOfTestMetrics + 1)
gpuMetric.Resource().Attributes().PutStr(conventions.AttributeServiceName, "containerInsightsDCGMExporterScraper")
neuronMetric := createTestResourceMetricsHelper(defaultNumberOfTestMetrics + 1)
neuronMetric.Resource().Attributes().PutStr(conventions.AttributeServiceName, "containerInsightsNeuronMonitorScraper")

counterSumMetrics := map[string]*metricInfo{
"spanCounter": {
@@ -390,6 +392,20 @@
"myServiceNS/containerInsightsDCGMExporterScraper",
containerInsightsReceiver,
},
{
"neuron monitor receiver",
neuronMetric,
map[string]string{
"isItAnError": "false",
"spanName": "testSpan",
},
map[string]string{
oTellibDimensionKey: "cloudwatch-lib",
"spanName": "testSpan",
},
"myServiceNS/containerInsightsNeuronMonitorScraper",
containerInsightsReceiver,
},
}

for _, tc := range testCases {
11 changes: 6 additions & 5 deletions internal/aws/containerinsight/const.go
@@ -157,11 +157,12 @@
TypeContainerDiskIO = "ContainerDiskIO"
// Special type for pause container
// because containerd does not set container name pause container name to POD like docker does.
TypeInfraContainer = "InfraContainer"
TypeGpuContainer = "ContainerGPU"
TypeGpuPod = "PodGPU"
TypeGpuNode = "NodeGPU"
TypeGpuCluster = "ClusterGPU"
TypeInfraContainer = "InfraContainer"
TypeGpuContainer = "ContainerGPU"
TypeGpuPod = "PodGPU"
TypeGpuNode = "NodeGPU"
TypeGpuCluster = "ClusterGPU"
TypeNeuronContainer = "ContainerNeuron"

// unit
UnitBytes = "Bytes"
4 changes: 4 additions & 0 deletions receiver/awscontainerinsightreceiver/config.go
@@ -60,4 +60,8 @@ type Config struct {

// EnableAcceleratedComputeMetrics enabled features with accelerated compute resources where metrics are scraped from vendor specific sources
EnableAcceleratedComputeMetrics bool `mapstructure:"accelerated_compute_metrics"`

// EnableAwsNeuronMetrics toggles Neuron monitoring where metrics are scraped from neuron monitor
// The default value is false.
EnableAwsNeuronMetrics bool `mapstructure:"neuron_metrics"`
}
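
Given the mapstructure tags above, the new scraper is switched on from the collector configuration by setting neuron_metrics: true (typically alongside accelerated_compute_metrics: true) under the awscontainerinsightreceiver section. The short in-package sketch below shows the same toggles set programmatically; it is illustrative only and not code from this PR.

// Illustrative sketch (inside the awscontainerinsightreceiver package). The
// YAML keys come from the mapstructure tags, e.g. "neuron_metrics" maps to
// EnableAwsNeuronMetrics; the default (see factory.go below) is false.
cfg := createDefaultConfig().(*Config)
cfg.EnableAcceleratedComputeMetrics = true
cfg.EnableAwsNeuronMetrics = true
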
4 changes: 4 additions & 0 deletions receiver/awscontainerinsightreceiver/factory.go
@@ -42,6 +42,9 @@ const (

// Don't enable EKS control plane metrics by default
defaultEnableControlPlaneMetrics = false

// Don't enable Neuron metrics by default
defaultEnableAwsNeuronMetrics = false
)

// NewFactory creates a factory for AWS container insight receiver
@@ -64,6 +67,7 @@ func createDefaultConfig() component.Config {
ClusterName: defaultClusterName,
LeaderLockName: defaultLeaderLockName,
EnableControlPlaneMetrics: defaultEnableControlPlaneMetrics,
EnableAwsNeuronMetrics: defaultEnableAwsNeuronMetrics,
}
}

receiver/awscontainerinsightreceiver/internal/cadvisor/extractors (file name not captured)
@@ -1,7 +1,7 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package extractors
package extractors // import "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver/internal/cadvisor/extractors"

import (
"fmt"
receiver/awscontainerinsightreceiver/internal/gpu (file name not captured)
@@ -1,12 +1,9 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package gpu
package gpu // import "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver/internal/gpu"

import (
"context"
"errors"
"fmt"
"time"

configutil "github.com/prometheus/common/config"
@@ -15,14 +12,8 @@ import (
"github.com/prometheus/prometheus/discovery"
"github.com/prometheus/prometheus/discovery/kubernetes"
"github.com/prometheus/prometheus/model/relabel"
"go.opentelemetry.io/collector/component"
"go.opentelemetry.io/collector/consumer"
"go.opentelemetry.io/collector/pdata/pmetric"
"go.opentelemetry.io/collector/receiver"
"go.uber.org/zap"

ci "github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/containerinsight"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver"
)

const (
@@ -33,76 +24,12 @@ const (
scraperK8sServiceSelector = "k8s-app=dcgm-exporter-service"
)

type DcgmScraper struct {
ctx context.Context
settings component.TelemetrySettings
host component.Host
hostInfoProvider hostInfoProvider
prometheusReceiver receiver.Metrics
k8sDecorator Decorator
running bool
}

type DcgmScraperOpts struct {
Ctx context.Context
TelemetrySettings component.TelemetrySettings
Consumer consumer.Metrics
Host component.Host
HostInfoProvider hostInfoProvider
K8sDecorator Decorator
Logger *zap.Logger
}

type hostInfoProvider interface {
GetClusterName() string
GetInstanceID() string
}

func NewDcgmScraper(opts DcgmScraperOpts) (*DcgmScraper, error) {
if opts.Consumer == nil {
return nil, errors.New("consumer cannot be nil")
}
if opts.Host == nil {
return nil, errors.New("host cannot be nil")
}
if opts.HostInfoProvider == nil {
return nil, errors.New("cluster name provider cannot be nil")
}

promConfig := prometheusreceiver.Config{
PrometheusConfig: &config.Config{
ScrapeConfigs: []*config.ScrapeConfig{getScraperConfig(opts.HostInfoProvider)},
},
}

params := receiver.CreateSettings{
TelemetrySettings: opts.TelemetrySettings,
}

decoConsumer := decorateConsumer{
containerOrchestrator: ci.EKS,
nextConsumer: opts.Consumer,
k8sDecorator: opts.K8sDecorator,
logger: opts.Logger,
}

promFactory := prometheusreceiver.NewFactory()
promReceiver, err := promFactory.CreateMetricsReceiver(opts.Ctx, params, &promConfig, &decoConsumer)
if err != nil {
return nil, fmt.Errorf("failed to create prometheus receiver: %w", err)
}

return &DcgmScraper{
ctx: opts.Ctx,
settings: opts.TelemetrySettings,
host: opts.Host,
hostInfoProvider: opts.HostInfoProvider,
prometheusReceiver: promReceiver,
k8sDecorator: opts.K8sDecorator,
}, nil
}

func getScraperConfig(hostInfoProvider hostInfoProvider) *config.ScrapeConfig {
func GetScraperConfig(hostInfoProvider hostInfoProvider) *config.ScrapeConfig {
return &config.ScrapeConfig{
HTTPClientConfig: configutil.HTTPClientConfig{
TLSConfig: configutil.TLSConfig{
@@ -201,35 +128,3 @@ func getMetricRelabelConfig(hostInfoProvider hostInfoProvider) []*relabel.Config
},
}
}

func (ds *DcgmScraper) GetMetrics() []pmetric.Metrics {
// This method will never return metrics because the metrics are collected by the scraper.
// This method will ensure the scraper is running
if !ds.running {
ds.settings.Logger.Info("The scraper is not running, starting up the scraper")
err := ds.prometheusReceiver.Start(ds.ctx, ds.host)
if err != nil {
ds.settings.Logger.Error("Unable to start PrometheusReceiver", zap.Error(err))
}
ds.running = err == nil
}

return nil
}

func (ds *DcgmScraper) Shutdown() {
if ds.running {
err := ds.prometheusReceiver.Shutdown(ds.ctx)
if err != nil {
ds.settings.Logger.Error("Unable to shutdown PrometheusReceiver", zap.Error(err))
}
ds.running = false
}

if ds.k8sDecorator != nil {
err := ds.k8sDecorator.Shutdown()
if err != nil {
ds.settings.Logger.Error("Unable to shutdown K8sDecorator", zap.Error(err))
}
}
}
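
Net effect of this file: the DCGM-specific scraper type, its constructor, and its start/shutdown handling move into the shared prometheusscraper.SimplePrometheusScraper, leaving only the exported GetScraperConfig here. A rough sketch of the resulting wiring is below; the option fields mirror the updated end-to-end test further down, while the helper name and exact interface types are assumptions.

// Rough sketch only (assumed to sit alongside GetScraperConfig in the gpu
// package): the DCGM scraper is now the shared SimplePrometheusScraper fed
// with the DCGM-specific scrape config.
func newDcgmScraperSketch(opts prometheusscraper.SimplePrometheusScraperOpts) (*prometheusscraper.SimplePrometheusScraper, error) {
	opts.ScraperConfigs = GetScraperConfig(opts.HostInfoProvider) // DCGM-specific part
	return prometheusscraper.NewSimplePrometheusScraper(opts)
}
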
receiver/awscontainerinsightreceiver/internal/gpu (test file; file name not captured)
@@ -21,7 +21,7 @@ import (

ci "github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/containerinsight"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver/internal/mocks"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver/internal/stores"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver/internal/prometheusscraper"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver"
)

@@ -50,17 +50,6 @@ func (m mockHostInfoProvider) GetInstanceID() string {
return dummyInstanceID
}

type mockDecorator struct {
}

func (m mockDecorator) Decorate(metric stores.CIMetric) stores.CIMetric {
return metric
}

func (m mockDecorator) Shutdown() error {
return nil
}

type mockConsumer struct {
t *testing.T
called *bool
@@ -103,42 +92,6 @@ func (m mockConsumer) ConsumeMetrics(_ context.Context, md pmetric.Metrics) erro
return nil
}

func TestNewDcgmScraperBadInputs(t *testing.T) {
settings := componenttest.NewNopTelemetrySettings()
settings.Logger, _ = zap.NewDevelopment()

tests := []DcgmScraperOpts{
{
Ctx: context.TODO(),
TelemetrySettings: settings,
Consumer: nil,
Host: componenttest.NewNopHost(),
HostInfoProvider: mockHostInfoProvider{},
},
{
Ctx: context.TODO(),
TelemetrySettings: settings,
Consumer: mockConsumer{},
Host: nil,
HostInfoProvider: mockHostInfoProvider{},
},
{
Ctx: context.TODO(),
TelemetrySettings: settings,
Consumer: mockConsumer{},
Host: componenttest.NewNopHost(),
HostInfoProvider: nil,
},
}

for _, tt := range tests {
scraper, err := NewDcgmScraper(tt)

assert.Error(t, err)
assert.Nil(t, scraper)
}
}

func TestNewDcgmScraperEndToEnd(t *testing.T) {
expected := map[string]struct {
value float64
@@ -182,16 +135,17 @@ func TestNewDcgmScraperEndToEnd(t *testing.T) {
settings := componenttest.NewNopTelemetrySettings()
settings.Logger, _ = zap.NewDevelopment()

scraper, err := NewDcgmScraper(DcgmScraperOpts{
scraper, err := prometheusscraper.NewSimplePrometheusScraper(prometheusscraper.SimplePrometheusScraperOpts{
Ctx: context.TODO(),
TelemetrySettings: settings,
Consumer: consumer,
Host: componenttest.NewNopHost(),
HostInfoProvider: mockHostInfoProvider{},
K8sDecorator: mockDecorator{},
ScraperConfigs: GetScraperConfig(mockHostInfoProvider{}),
Logger: settings.Logger,
})
assert.NoError(t, err)
assert.Equal(t, mockHostInfoProvider{}, scraper.hostInfoProvider)
assert.Equal(t, mockHostInfoProvider{}, scraper.HostInfoProvider)

// build up a new PR
promFactory := prometheusreceiver.NewFactory()
@@ -207,7 +161,7 @@ func TestNewDcgmScraperEndToEnd(t *testing.T) {
mp, cfg, err := mocks.SetupMockPrometheus(targets...)
assert.NoError(t, err)

scrapeConfig := getScraperConfig(scraper.hostInfoProvider)
scrapeConfig := scraper.ScraperConfigs
scrapeConfig.ScrapeInterval = cfg.ScrapeConfigs[0].ScrapeInterval
scrapeConfig.ScrapeTimeout = cfg.ScrapeConfigs[0].ScrapeInterval
scrapeConfig.Scheme = "http"
@@ -238,9 +192,10 @@

// replace the prom receiver
params := receiver.CreateSettings{
TelemetrySettings: scraper.settings,
TelemetrySettings: scraper.Settings,
}
scraper.prometheusReceiver, err = promFactory.CreateMetricsReceiver(scraper.ctx, params, &promConfig, consumer)
scraper.PrometheusReceiver, err = promFactory.CreateMetricsReceiver(scraper.Ctx, params, &promConfig, consumer)

assert.NoError(t, err)
assert.NotNil(t, mp)
defer mp.Close()