From e2ffe2bb0d1f901703270e4dbb0de5a25c216d44 Mon Sep 17 00:00:00 2001 From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com> Date: Mon, 23 Oct 2023 11:44:03 +0200 Subject: [PATCH] Enable log errors check test and filter for acceptable errors (#3616) (#3637) (cherry picked from commit 96d46f9073d95a6fe79274f006efe43860f9c78f) Co-authored-by: Michal Pristas --- .../application/dispatcher/dispatcher.go | 2 +- .../gateway/fleet/fleet_gateway.go | 2 +- .../agent/application/monitoring/server.go | 2 +- .../pkg/agent/storage/store/action_store.go | 2 +- .../pkg/agent/storage/store/state_store.go | 2 +- internal/pkg/capabilities/upgrade.go | 2 +- .../pkg/fleetapi/acker/lazy/lazy_acker.go | 2 +- pkg/testing/tools/estools/elasticsearch.go | 7 +++++ testing/integration/monitoring_logs_test.go | 26 ++++++++++++++++--- 9 files changed, 36 insertions(+), 11 deletions(-) diff --git a/internal/pkg/agent/application/dispatcher/dispatcher.go b/internal/pkg/agent/application/dispatcher/dispatcher.go index a4ec47a96fe..92ac050f9ab 100644 --- a/internal/pkg/agent/application/dispatcher/dispatcher.go +++ b/internal/pkg/agent/application/dispatcher/dispatcher.go @@ -248,7 +248,7 @@ func (ad *ActionDispatcher) scheduleRetry(ctx context.Context, action fleetapi.R attempt := action.RetryAttempt() d, err := ad.rt.GetWait(attempt) if err != nil { - ad.log.Errorf("No more reties for action id %s: %v", action.ID(), err) + ad.log.Errorf("No more retries for action id %s: %v", action.ID(), err) action.SetRetryAttempt(-1) if err := acker.Ack(ctx, action); err != nil { ad.log.Errorf("Unable to ack action failure (id %s) to fleet-server: %v", action.ID(), err) diff --git a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go index 31c81955a10..000ec534bf2 100644 --- a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go +++ b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go @@ -217,7 +217,7 @@ func (f *FleetGateway) doExecute(ctx context.Context, bo backoff.Backoff) (*flee if f.checkinFailCounter > 0 { // Log at same level as error logs above so subsequent successes are visible when log level is set to 'error'. - f.log.Errorf("Checkin request to fleet-server succeeded after %d failures", f.checkinFailCounter) + f.log.Warnf("Checkin request to fleet-server succeeded after %d failures", f.checkinFailCounter) } f.checkinFailCounter = 0 diff --git a/internal/pkg/agent/application/monitoring/server.go b/internal/pkg/agent/application/monitoring/server.go index 390a472d5ed..47561d29e49 100644 --- a/internal/pkg/agent/application/monitoring/server.go +++ b/internal/pkg/agent/application/monitoring/server.go @@ -35,7 +35,7 @@ func NewServer( ) (*api.Server, error) { if err := createAgentMonitoringDrop(endpointConfig.Host); err != nil { // log but ignore - log.Errorf("failed to create monitoring drop: %v", err) + log.Warnf("failed to create monitoring drop: %v", err) } cfg, err := config.NewConfigFrom(endpointConfig) diff --git a/internal/pkg/agent/storage/store/action_store.go b/internal/pkg/agent/storage/store/action_store.go index ea0b2eb3c8b..4fc9df8b485 100644 --- a/internal/pkg/agent/storage/store/action_store.go +++ b/internal/pkg/agent/storage/store/action_store.go @@ -33,7 +33,7 @@ func newActionStore(log *logger.Logger, store storeLoad) (*actionStore, error) { // and return an empty store. reader, err := store.Load() if err != nil { - log.Errorf("failed to load action store, returning empty contents: %v", err.Error()) + log.Warnf("failed to load action store, returning empty contents: %v", err.Error()) return &actionStore{log: log, store: store}, nil } defer reader.Close() diff --git a/internal/pkg/agent/storage/store/state_store.go b/internal/pkg/agent/storage/store/state_store.go index 6f64f1184bf..3e794c3547b 100644 --- a/internal/pkg/agent/storage/store/state_store.go +++ b/internal/pkg/agent/storage/store/state_store.go @@ -95,7 +95,7 @@ func NewStateStore(log *logger.Logger, store storeLoad) (*StateStore, error) { // and return an empty store. reader, err := store.Load() if err != nil { - log.Errorf("failed to load state store, returning empty contents: %v", err.Error()) + log.Warnf("failed to load state store, returning empty contents: %v", err.Error()) return &StateStore{log: log, store: store}, nil } defer reader.Close() diff --git a/internal/pkg/capabilities/upgrade.go b/internal/pkg/capabilities/upgrade.go index 07866ec111e..0f7b19babf8 100644 --- a/internal/pkg/capabilities/upgrade.go +++ b/internal/pkg/capabilities/upgrade.go @@ -65,7 +65,7 @@ func allowUpgrade( for _, cap := range upgradeCaps { result, err := cap.condition.Eval(varStore, true) if err != nil { - log.Errorf("failed evaluating eql formula %q, skipping: %v", cap.conditionStr, err) + log.Warnf("failed evaluating eql formula %q, skipping: %v", cap.conditionStr, err) continue } if result { diff --git a/internal/pkg/fleetapi/acker/lazy/lazy_acker.go b/internal/pkg/fleetapi/acker/lazy/lazy_acker.go index 298b2b5bf7f..65f7bdc1cac 100644 --- a/internal/pkg/fleetapi/acker/lazy/lazy_acker.go +++ b/internal/pkg/fleetapi/acker/lazy/lazy_acker.go @@ -87,7 +87,7 @@ func (f *Acker) Commit(ctx context.Context) (err error) { // If request failed enqueue all actions with retrier if it is set if err != nil { if f.retrier != nil { - f.log.Errorf("lazy acker: failed ack batch, enqueue for retry: %s", actions) + f.log.Warnf("lazy acker: failed ack batch, enqueue for retry: %s", actions) f.retrier.Enqueue(actions) return nil } diff --git a/pkg/testing/tools/estools/elasticsearch.go b/pkg/testing/tools/estools/elasticsearch.go index 8cd6e126597..ca6dad2dba4 100644 --- a/pkg/testing/tools/estools/elasticsearch.go +++ b/pkg/testing/tools/estools/elasticsearch.go @@ -201,6 +201,13 @@ func CheckForErrorsInLogsWithContext(ctx context.Context, client elastictranspor "log.level": "error", }, }, + { + "term": map[string]interface{}{ + "data_stream.namespace": map[string]interface{}{ + "value": namespace, + }, + }, + }, }, "must_not": excludeStatements, }, diff --git a/testing/integration/monitoring_logs_test.go b/testing/integration/monitoring_logs_test.go index 97836c7ff3f..c52b2150d47 100644 --- a/testing/integration/monitoring_logs_test.go +++ b/testing/integration/monitoring_logs_test.go @@ -38,7 +38,6 @@ func TestMonitoringLogsShipped(t *testing.T) { ctx := context.Background() t.Logf("got namespace: %s", info.Namespace) - t.Skip("Test is flaky; see https://github.com/elastic/elastic-agent/issues/3081") agentFixture, err := define.NewFixture(t, define.Version()) require.NoError(t, err) @@ -90,7 +89,7 @@ func TestMonitoringLogsShipped(t *testing.T) { require.NotZero(t, len(docs.Hits.Hits)) t.Logf("metricbeat: Got %d documents", len(docs.Hits.Hits)) - // Stage 4: make sure all components are health + // Stage 4: make sure all components are healthy t.Log("Making sure all components are healthy") status, err := agentFixture.ExecStatus(ctx) require.NoError(t, err, @@ -101,7 +100,26 @@ func TestMonitoringLogsShipped(t *testing.T) { c.Name, client.Healthy, client.State(c.State)) } - // Stage 5: Make sure we have message confirming central management is running + // Stage 5: Make sure there are no errors in logs + t.Log("Making sure there are no error logs") + docs = findESDocs(t, func() (estools.Documents, error) { + return estools.CheckForErrorsInLogs(info.ESClient, info.Namespace, []string{ + // acceptable error messages (include reason) + "Error dialing dial tcp 127.0.0.1:9200: connect: connection refused", // beat is running default config before its config gets updated + "Global configuration artifact is not available", // Endpoint: failed to load user artifact due to connectivity issues + "Failed to download artifact", + "Failed to initialize artifact", + "Failed to apply initial policy from on disk configuration", + "elastic-agent-client error: rpc error: code = Canceled desc = context canceled", // can happen on restart + }) + }) + t.Logf("errors: Got %d documents", len(docs.Hits.Hits)) + for _, doc := range docs.Hits.Hits { + t.Logf("%#v", doc.Source) + } + require.Empty(t, docs.Hits.Hits) + + // Stage 6: Make sure we have message confirming central management is running t.Log("Making sure we have message confirming central management is running") docs = findESDocs(t, func() (estools.Documents, error) { return estools.FindMatchingLogLines(info.ESClient, info.Namespace, @@ -109,7 +127,7 @@ func TestMonitoringLogsShipped(t *testing.T) { }) require.NotZero(t, len(docs.Hits.Hits)) - // Stage 6: verify logs from the monitoring components are not sent to the output + // Stage 7: verify logs from the monitoring components are not sent to the output t.Log("Check monitoring logs") hostname, err := os.Hostname() if err != nil {