From 3c5b37549473c87e9bb85db581a7200012d83c96 Mon Sep 17 00:00:00 2001 From: Michel Laterman <82832767+michel-laterman@users.noreply.github.com> Date: Mon, 24 Jun 2024 09:02:26 -0700 Subject: [PATCH] Add tlscommon and httpcommon diagnostics hooks (#3587) Add custom hooks to use diag hooks added in elastic/elastic-agent-libs#207 to provide additional files that contain information about the TLS certs used by the server's API, TLS information used when connecting to elasticsearch, and a full trace to each specified elasticsearch host. --- ...84-Add-tlscommon-and-httpcommon-hooks.yaml | 36 ++++++++++++++ internal/pkg/api/openapi.gen.go | 3 +- internal/pkg/config/output.go | 49 +++++++++++++++++++ internal/pkg/config/output_test.go | 18 +++++++ internal/pkg/server/agent.go | 38 ++++++++++++++ model/openapi.yml | 1 + pkg/api/types.gen.go | 3 +- 7 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 changelog/fragments/1718034684-Add-tlscommon-and-httpcommon-hooks.yaml diff --git a/changelog/fragments/1718034684-Add-tlscommon-and-httpcommon-hooks.yaml b/changelog/fragments/1718034684-Add-tlscommon-and-httpcommon-hooks.yaml new file mode 100644 index 000000000..d4eb071b3 --- /dev/null +++ b/changelog/fragments/1718034684-Add-tlscommon-and-httpcommon-hooks.yaml @@ -0,0 +1,36 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. 
+# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: enhancement + +# Change summary; an 80ish-character-long description of the change. +summary: Add tlscommon and httpcommon diagnostic information + +# Long description; in case the summary is not enough to describe the change +# this field accommodates a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +description: | + Add tlscommon and httpcommon hooks for fleet-server to add API and + output TLS diagnostics files when diagnostics are collected as well as + an httpcommon diagnostics trace for fleet-server's connection to + Elasticsearch. + +# Affected component; a word indicating the component this changeset affects. +component: + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/fleet-server/pull/3587 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/api/openapi.gen.go b/internal/pkg/api/openapi.gen.go index 10b4243fe..743ebab50 100644 --- a/internal/pkg/api/openapi.gen.go +++ b/internal/pkg/api/openapi.gen.go @@ -37,7 +37,8 @@ const ( // Defines values for ActionRequestDiagnosticsAdditionalMetrics. 
const ( - CPU ActionRequestDiagnosticsAdditionalMetrics = "CPU" + CONN ActionRequestDiagnosticsAdditionalMetrics = "CONN" + CPU ActionRequestDiagnosticsAdditionalMetrics = "CPU" ) // Defines values for ActionSettingsLogLevel. diff --git a/internal/pkg/config/output.go b/internal/pkg/config/output.go index 185d68542..e91dda87c 100644 --- a/internal/pkg/config/output.go +++ b/internal/pkg/config/output.go @@ -5,6 +5,8 @@ package config import ( + "bytes" + "context" "fmt" "net" "net/http" @@ -16,8 +18,10 @@ import ( "time" urlutil "github.com/elastic/elastic-agent-libs/kibana" + "github.com/elastic/elastic-agent-libs/transport/httpcommon" "github.com/elastic/elastic-agent-libs/transport/tlscommon" "github.com/elastic/go-elasticsearch/v8" + "github.com/rs/zerolog" ) // The timeout would be driven by the server for long poll. @@ -228,3 +232,48 @@ func makeURL(defaultScheme string, defaultPath string, rawURL string, defaultPor addr.Host = host + ":" + port return addr.String(), nil } + +func (c *Elasticsearch) DiagRequests(ctx context.Context) []byte { + pURL, err := httpcommon.NewProxyURIFromString(c.ProxyURL) + if err != nil { + zerolog.Ctx(ctx).Warn().Err(err).Msg("Unable to transform proxy_url to url.URL") + } + settings := httpcommon.HTTPTransportSettings{ + TLS: c.TLS, + Timeout: c.Timeout, + Proxy: httpcommon.HTTPClientProxySettings{ + Disable: c.ProxyDisable, + URL: pURL, + Headers: httpcommon.ProxyHeaders(c.ProxyHeaders), + }, + } + headers := http.Header{} + for k, v := range c.Headers { + headers.Set(k, v) + } + + reqs := make([]*http.Request, 0, len(c.Hosts)) + + var res bytes.Buffer + for _, host := range c.Hosts { + u, err := url.Parse(host) + if err != nil { + zerolog.Ctx(ctx).Warn().Err(err).Str("host", host).Msg("Unable to transform host to url.URL") + res.WriteString(fmt.Sprintf("Unable to transform host %q to url.URL: %v\n", host, err)) + continue + } + if u.Scheme == "" { + u.Scheme = c.Protocol + } + req, err := http.NewRequestWithContext(ctx, 
http.MethodGet, u.String(), nil) + if err != nil { + zerolog.Ctx(ctx).Warn().Err(err).Str("host", host).Msg("Unable to create request to host") + res.WriteString(fmt.Sprintf("Unable to create request to host %q: %v\n", host, err)) + continue + } + req.Header = headers.Clone() + reqs = append(reqs, req) + } + res.Write(settings.DiagRequests(reqs)()) + return res.Bytes() +} diff --git a/internal/pkg/config/output_test.go b/internal/pkg/config/output_test.go index a2fdf5cfd..11cc2c699 100644 --- a/internal/pkg/config/output_test.go +++ b/internal/pkg/config/output_test.go @@ -8,8 +8,10 @@ package config import ( + "context" "crypto/tls" "net/http" + "net/http/httptest" "os" "path/filepath" "testing" @@ -382,3 +384,19 @@ func setTestEnv(t *testing.T, env map[string]string) { t.Setenv(k, v) } } + +func Test_Elasticsearch_DiagRequests(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + es := &Elasticsearch{} + es.InitDefaults() + es.Hosts = []string{srv.URL} + + p := es.DiagRequests(ctx) + require.NotEmpty(t, p) + require.Contains(t, string(p), "request 0 successful.") +} diff --git a/internal/pkg/server/agent.go b/internal/pkg/server/agent.go index e004d2786..955ac1ada 100644 --- a/internal/pkg/server/agent.go +++ b/internal/pkg/server/agent.go @@ -115,6 +115,44 @@ func (a *Agent) Run(ctx context.Context) error { } return p }) + a.agent.RegisterDiagnosticHook("fleet-server api tls diag", "fleet-server's API TLS config", "fleet-server-api-tls.txt", "text/plain", func() []byte { + if a.srv == nil { + log.Warn().Msg("Diagnostics hook failure fleet-server is nil.") + return []byte(`Diagnostics hook failure fleet-server is nil`) + } + cfg := a.srv.GetConfig() + if cfg == nil || len(cfg.Inputs) == 0 { + log.Warn().Msg("Diagnostics hook failure config is nil.") + return []byte(`Diagnostics 
hook failure config is nil`) + } + return cfg.Inputs[0].Server.TLS.DiagCerts()() + }) + a.agent.RegisterDiagnosticHook("fleet-server output tls diag", "fleet-server's output TLS config", "fleet-server-output-tls.txt", "text/plain", func() []byte { + if a.srv == nil { + log.Warn().Msg("Diagnostics hook failure fleet-server is nil.") + return []byte(`Diagnostics hook failure fleet-server is nil`) + } + cfg := a.srv.GetConfig() + if cfg == nil { + log.Warn().Msg("Diagnostics hook failure config is nil.") + return []byte(`Diagnostics hook failure config is nil`) + } + return cfg.Output.Elasticsearch.TLS.DiagCerts()() + }) + a.agent.RegisterOptionalDiagnosticHook("CONN", "fleet-server output request diag", "fleet-server output request trace diagnostics", "fleet-server-output-request.txt", "text/plain", func() []byte { + if a.srv == nil { + log.Warn().Msg("Diagnostics hook failure fleet-server is nil.") + return []byte(`Diagnostics hook failure fleet-server is nil`) + } + cfg := a.srv.GetConfig() + if cfg == nil { + log.Warn().Msg("Diagnostics hook failure config is nil.") + return []byte(`Diagnostics hook failure config is nil`) + } + ctx, cancel := context.WithTimeout(ctx, time.Second*30) // TODO(michel-laterman): duration/timeout should be part of the diagnostics action from fleet-server (https://github.com/elastic/fleet-server/issues/3648) and the control protocol (https://github.com/elastic/elastic-agent-client/issues/113) + defer cancel() + return cfg.Output.Elasticsearch.DiagRequests(ctx) + }) subCtx, subCanceller := context.WithCancel(ctx) defer subCanceller() diff --git a/model/openapi.yml b/model/openapi.yml index a69a355ae..29846e800 100644 --- a/model/openapi.yml +++ b/model/openapi.yml @@ -557,6 +557,7 @@ components: type: string enum: - CPU + - CONN actionPolicyReassign: description: The POLICY_REASSIGN action data. 
type: object diff --git a/pkg/api/types.gen.go b/pkg/api/types.gen.go index 38edf510d..bcd512b9a 100644 --- a/pkg/api/types.gen.go +++ b/pkg/api/types.gen.go @@ -34,7 +34,8 @@ const ( // Defines values for ActionRequestDiagnosticsAdditionalMetrics. const ( - CPU ActionRequestDiagnosticsAdditionalMetrics = "CPU" + CONN ActionRequestDiagnosticsAdditionalMetrics = "CONN" + CPU ActionRequestDiagnosticsAdditionalMetrics = "CPU" ) // Defines values for ActionSettingsLogLevel.