From fa4da883751d36f1dda0aa3d3eba340db5bd5719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Wei=C3=9Fe?= <66256922+daniel-weisse@users.noreply.github.com> Date: Mon, 25 Sep 2023 12:10:07 +0200 Subject: [PATCH] cli: report log collection failure to user (#2354) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Report log collection failure to user * Try collecting logs for more error cases --------- Signed-off-by: Daniel Weiße --- cli/internal/cmd/init.go | 62 ++++++++++++++++++++++++++++++----- cli/internal/cmd/init_test.go | 57 ++++++++++++++++++++++++-------- 2 files changed, 96 insertions(+), 23 deletions(-) diff --git a/cli/internal/cmd/init.go b/cli/internal/cmd/init.go index 8663ec0fbd..a63502d8be 100644 --- a/cli/internal/cmd/init.go +++ b/cli/internal/cmd/init.go @@ -235,7 +235,11 @@ func (i *initCmd) initialize( if errors.As(err, &nonRetriable) { cmd.PrintErrln("Cluster initialization failed. This error is not recoverable.") cmd.PrintErrln("Terminate your cluster and try again.") - cmd.PrintErrf("Fetched bootstrapper logs are stored in %q\n", i.pf.PrefixPrintablePath(constants.ErrorLog)) + if nonRetriable.logCollectionErr != nil { + cmd.PrintErrf("Failed to collect logs from bootstrapper: %s\n", nonRetriable.logCollectionErr) + } else { + cmd.PrintErrf("Fetched bootstrapper logs are stored in %q\n", i.pf.PrefixPrintablePath(constants.ErrorLog)) + } } return err } @@ -330,7 +334,10 @@ func (d *initDoer) Do(ctx context.Context) error { // connectedOnce is set in handleGRPCStateChanges when a connection was established in one retry attempt. // This should cancel any other retry attempts when the connection is lost since the bootstrapper likely won't accept any new attempts anymore. if d.connectedOnce { - return &nonRetriableError{errors.New("init already connected to the remote server in a previous attempt - resumption is not supported")} + return &nonRetriableError{ + logCollectionErr: errors.New("init already connected to the remote server in a previous attempt - resumption is not supported"), + err: errors.New("init already connected to the remote server in a previous attempt - resumption is not supported"), + } } conn, err := d.dialer.Dial(ctx, d.endpoint) @@ -351,31 +358,58 @@ func (d *initDoer) Do(ctx context.Context) error { d.log.Debugf("Created protoClient") resp, err := protoClient.Init(ctx, d.req) if err != nil { - return &nonRetriableError{fmt.Errorf("init call: %w", err)} + return &nonRetriableError{ + logCollectionErr: errors.New("rpc failed before first response was received - no logs available"), + err: fmt.Errorf("init call: %w", err), + } } res, err := resp.Recv() // get first response, either success or failure if err != nil { if e := d.getLogs(resp); e != nil { d.log.Debugf("Failed to collect logs: %s", e) + return &nonRetriableError{ + logCollectionErr: e, + err: err, + } } - return &nonRetriableError{err} + return &nonRetriableError{err: err} } switch res.Kind.(type) { case *initproto.InitResponse_InitFailure: if e := d.getLogs(resp); e != nil { d.log.Debugf("Failed to get logs from cluster: %s", e) + return &nonRetriableError{ + logCollectionErr: e, + err: errors.New(res.GetInitFailure().GetError()), + } } - return &nonRetriableError{errors.New(res.GetInitFailure().GetError())} + return &nonRetriableError{err: errors.New(res.GetInitFailure().GetError())} case *initproto.InitResponse_InitSuccess: d.resp = res.GetInitSuccess() case nil: d.log.Debugf("Cluster returned nil response type") - return &nonRetriableError{errors.New("empty response from cluster")} + err = errors.New("empty response from cluster") + if e := d.getLogs(resp); e != nil { + d.log.Debugf("Failed to collect logs: %s", e) + return &nonRetriableError{ + logCollectionErr: e, + err: err, + } + } + return &nonRetriableError{err: err} default: d.log.Debugf("Cluster returned unknown response type") - return &nonRetriableError{errors.New("unknown response from cluster")} + err = errors.New("unknown response from cluster") + if e := d.getLogs(resp); e != nil { + d.log.Debugf("Failed to collect logs: %s", e) + return &nonRetriableError{ + logCollectionErr: e, + err: err, + } + } + return &nonRetriableError{err: err} } return nil @@ -392,9 +426,18 @@ func (d *initDoer) getLogs(resp initproto.API_InitClient) error { return err } + switch res.Kind.(type) { + case *initproto.InitResponse_InitFailure: + return errors.New("trying to collect logs: received init failure response, expected log response") + case *initproto.InitResponse_InitSuccess: + return errors.New("trying to collect logs: received init success response, expected log response") + case nil: + return errors.New("trying to collect logs: received nil response, expected log response") + } + log := res.GetLog().GetLog() if log == nil { - return errors.New("sent empty logs") + return errors.New("received empty logs") } if err := d.fh.Write(constants.ErrorLog, log, file.OptAppend); err != nil { @@ -609,7 +652,8 @@ type grpcDialer interface { } type nonRetriableError struct { - err error + logCollectionErr error + err error } // Error returns the error message. diff --git a/cli/internal/cmd/init_test.go b/cli/internal/cmd/init_test.go index a6a75202d6..77b0b60099 100644 --- a/cli/internal/cmd/init_test.go +++ b/cli/internal/cmd/init_test.go @@ -90,22 +90,47 @@ func TestInitialize(t *testing.T) { idFile: &clusterid.File{IP: "192.0.2.1"}, configMutator: func(c *config.Config) { c.Provider.GCP.ServiceAccountKeyPath = serviceAccPath }, serviceAccKey: gcpServiceAccKey, - initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, + initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}}, }, "initialize some azure instances": { provider: cloudprovider.Azure, idFile: &clusterid.File{IP: "192.0.2.1"}, - initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, + initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}}, }, "initialize some qemu instances": { provider: cloudprovider.QEMU, idFile: &clusterid.File{IP: "192.0.2.1"}, - initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, + initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}}, }, "non retriable error": { provider: cloudprovider.QEMU, idFile: &clusterid.File{IP: "192.0.2.1"}, - initServerAPI: &stubInitServer{initErr: &nonRetriableError{assert.AnError}}, + initServerAPI: &stubInitServer{initErr: &nonRetriableError{err: assert.AnError}}, + retriable: false, + masterSecretShouldExist: true, + wantErr: true, + }, + "non retriable error with failed log collection": { + provider: cloudprovider.QEMU, + idFile: &clusterid.File{IP: "192.0.2.1"}, + initServerAPI: &stubInitServer{ + res: []*initproto.InitResponse{ + { + Kind: &initproto.InitResponse_InitFailure{ + InitFailure: &initproto.InitFailureResponse{ + Error: "error", + }, + }, + }, + { + Kind: &initproto.InitResponse_InitFailure{ + InitFailure: &initproto.InitFailureResponse{ + Error: "error", + }, + }, + }, + }, + }, retriable: false, masterSecretShouldExist: true, wantErr: true, @@ -132,7 +157,7 @@ func TestInitialize(t *testing.T) { "k8s version without v works": { provider: cloudprovider.Azure, idFile: &clusterid.File{IP: "192.0.2.1"}, - initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, + initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}}, configMutator: func(c *config.Config) { res, err := versions.NewValidK8sVersion(strings.TrimPrefix(string(versions.Default), "v"), true) require.NoError(t, err) @@ -142,7 +167,7 @@ func TestInitialize(t *testing.T) { "outdated k8s patch version doesn't work": { provider: cloudprovider.Azure, idFile: &clusterid.File{IP: "192.0.2.1"}, - initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, + initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}}, configMutator: func(c *config.Config) { v, err := semver.New(versions.SupportedK8sVersions()[0]) require.NoError(t, err) @@ -458,12 +483,14 @@ func TestAttestation(t *testing.T) { assert := assert.New(t) require := require.New(t) - initServerAPI := &stubInitServer{res: &initproto.InitResponse{ - Kind: &initproto.InitResponse_InitSuccess{ - InitSuccess: &initproto.InitSuccessResponse{ - Kubeconfig: []byte("kubeconfig"), - OwnerId: []byte("ownerID"), - ClusterId: []byte("clusterID"), + initServerAPI := &stubInitServer{res: []*initproto.InitResponse{ + { + Kind: &initproto.InitResponse_InitSuccess{ + InitSuccess: &initproto.InitSuccessResponse{ + Kubeconfig: []byte("kubeconfig"), + OwnerId: []byte("ownerID"), + ClusterId: []byte("clusterID"), + }, }, }, }} @@ -577,14 +604,16 @@ func (i *testIssuer) Issue(_ context.Context, userData []byte, _ []byte) ([]byte } type stubInitServer struct { - res *initproto.InitResponse + res []*initproto.InitResponse initErr error initproto.UnimplementedAPIServer } func (s *stubInitServer) Init(_ *initproto.InitRequest, stream initproto.API_InitServer) error { - _ = stream.Send(s.res) + for _, r := range s.res { + _ = stream.Send(r) + } return s.initErr }