Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug Report: empty ListShardTablet result when remote cell topology is down #13801

Closed
DeathBorn opened this issue Aug 17, 2023 · 12 comments · Fixed by #15829
Closed

Bug Report: empty ListShardTablet result when remote cell topology is down #13801

DeathBorn opened this issue Aug 17, 2023 · 12 comments · Fixed by #15829

Comments

@DeathBorn
Copy link
Contributor

Overview of the Issue

When remote cell topology is unreachable (example: in case of cutting internet cables), vtctl binary and vtctlclient/vtctld GRPC api return different results.

Example test case provided with 2 tablets in different cells. When second tablet cell topology is down,

  • vtctl would return only available cell tablet
  • vtctlclient/vtctld GRPC api would take RemoteOperationTimeout long to execute and nothing would be returned

We should return at least the available cell tablets. Otherwise, many applications which rely on GRPC results might break in unexpected ways.

Reproduction Steps

Unit tests

unit test file placed here go/vt/vtctld/api2_test.go

package vtctld

import (
	"fmt"
	"io"
	"net"
	"os"
	"os/exec"
	"path"
	"testing"
	"time"

	"context"

	clientv3 "go.etcd.io/etcd/client/v3"
	"google.golang.org/grpc"
	"vitess.io/vitess/go/testfiles"
	"vitess.io/vitess/go/vt/log"
	"vitess.io/vitess/go/vt/logutil"
	"vitess.io/vitess/go/vt/topo"
	"vitess.io/vitess/go/vt/vtctl"
	"vitess.io/vitess/go/vt/vtctl/grpcvtctlserver"
	"vitess.io/vitess/go/vt/vtctl/vtctlclient"
	"vitess.io/vitess/go/vt/vttablet/tmclient"
	"vitess.io/vitess/go/vt/wrangler"

	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
	vtctlservicepb "vitess.io/vitess/go/vt/proto/vtctlservice"
)

// startEtcd starts an etcd subprocess, and waits for it to be ready.
// startEtcd2 starts an etcd subprocess listening on a port derived from
// shift, and waits until the daemon answers a Get on "/" (failing the test
// after 10s). It returns the running command, its data directory, and the
// client URL.
func startEtcd2(t *testing.T, shift int) (*exec.Cmd, string, string) {
	// Create a temporary directory (removed automatically when the test ends).
	dataDir := t.TempDir()

	// Get our two ports to listen to.
	port := testfiles.GoVtTopoEtcd2topoPort + shift
	name := fmt.Sprintf("vitess_unit_test%v", shift)
	clientAddr := fmt.Sprintf("http://localhost:%v", port)
	peerAddr := fmt.Sprintf("http://localhost:%v", port+1)
	initialCluster := fmt.Sprintf("%v=%v", name, peerAddr)

	cmd := exec.Command("etcd",
		"-name", name,
		"-advertise-client-urls", clientAddr,
		"-initial-advertise-peer-urls", peerAddr,
		"-listen-client-urls", clientAddr,
		"-listen-peer-urls", peerAddr,
		"-initial-cluster", initialCluster,
		"-data-dir", dataDir)
	if err := cmd.Start(); err != nil {
		t.Fatalf("failed to start etcd: %v", err)
	}

	// Create a client to connect to the created etcd.
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{clientAddr},
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		t.Fatalf("newCellClient(%v) failed: %v", clientAddr, err)
	}
	// The client is only used for the readiness probe below; close it so we
	// don't leak its goroutines/connections into the rest of the test.
	defer cli.Close()

	// Wait until we can list "/", or timeout.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	start := time.Now()
	for {
		if _, err := cli.Get(ctx, "/"); err == nil {
			break
		}
		if time.Since(start) > 10*time.Second {
			t.Fatalf("Failed to start etcd daemon in time")
		}
		time.Sleep(10 * time.Millisecond)
	}

	return cmd, dataDir, clientAddr
}

// TestEtcd2TopoOne reproduces the issue through the vtctld gRPC API: it
// starts two etcd daemons (global/cell1 and cell2), registers both cells,
// creates one tablet per cell, runs ListShardTablets via the gRPC
// ExecuteVtctlCommand endpoint, then kills the second etcd and runs the
// command again to show that the surviving cell's tablet is not returned.
func TestEtcd2TopoOne(t *testing.T) {
	logger := logutil.NewMemoryLogger()
	ctx := context.Background()

	cmd, dataDir, clientAddr := startEtcd2(t, 100)
	defer func() {
		if err := cmd.Process.Kill(); err != nil {
			log.Errorf("cmd.Process.Kill() failed : %v", err)
		}
		// Reap the process; an error after Kill is expected, just log it.
		if err := cmd.Wait(); err != nil {
			log.Errorf("cmd.wait() failed : %v", err)
		}
		os.RemoveAll(dataDir)
	}()

	cmd2, dataDir2, clientAddr2 := startEtcd2(t, 200)
	// stop2 kills the second cell's etcd mid-test to simulate an
	// unreachable remote cell topology.
	stop2 := func() {
		if err := cmd2.Process.Kill(); err != nil {
			log.Errorf("cmd.Process.Kill() failed : %v", err)
		}
		// Reap the process; an error after Kill is expected, just log it.
		if err := cmd2.Wait(); err != nil {
			log.Errorf("cmd.wait() failed : %v", err)
		}
		os.RemoveAll(dataDir2)
	}

	// Give both etcd daemons time to settle before using them.
	time.Sleep(5 * time.Second)

	LocalCellName := "testas1"
	LocalCellName2 := "testas2"
	testRoot := "/test-9"

	// ---------------------------------------------
	// Create the server on the new root.
	ts, err := topo.OpenServer("etcd2", clientAddr, topo.GlobalCell)
	if err != nil {
		t.Fatalf("OpenServer() failed: %v", err)
	}

	// Listen on a random port.
	listener, err := net.Listen("tcp", ":0")
	if err != nil {
		t.Fatalf("Cannot listen: %v", err)
	}
	defer listener.Close()

	grpcServer := grpc.NewServer()
	vtctlservicepb.RegisterVtctlServer(grpcServer, grpcvtctlserver.NewVtctlServer(ts))

	go grpcServer.Serve(listener)
	defer grpcServer.Stop()

	// ---------------------------------------------
	// Create the CellInfo for each cell. Failures are printed rather than
	// fatal so the test can be re-run against etcds that already hold data.
	if err := ts.CreateCellInfo(context.Background(), LocalCellName, &topodatapb.CellInfo{
		ServerAddress: clientAddr,
		Root:          path.Join(testRoot, LocalCellName),
	}); err != nil {
		fmt.Printf("CreateCellInfo() failed: %v\n", err)
	}

	if err := ts.CreateCellInfo(context.Background(), LocalCellName2, &topodatapb.CellInfo{
		ServerAddress: clientAddr2,
		Root:          path.Join(testRoot, LocalCellName2),
	}); err != nil {
		fmt.Printf("CreateCellInfo() failed: %v\n", err)
	}

	if err := ts.CreateKeyspace(ctx, "test_keyspace", &topodatapb.Keyspace{}); err != nil {
		fmt.Printf("CreateKeyspace: %v\n", err)
	}

	if err := ts.CreateShard(ctx, "test_keyspace", "0"); err != nil {
		fmt.Printf("CreateShard: %v\n", err)
	}

	// Primary tablet in the first (surviving) cell.
	tablet := &topodatapb.Tablet{
		Alias: &topodatapb.TabletAlias{
			Cell: LocalCellName,
			Uid:  1,
		},
		Hostname:      "localhost",
		MysqlHostname: "localhost",
		PortMap: map[string]int32{
			"vt": 3333,
		},
		Shard:    "0",
		Tags:     map[string]string{"tag": "value"},
		Keyspace: "test_keyspace",
		Type:     topodatapb.TabletType_MASTER,
	}
	if err := ts.CreateTablet(ctx, tablet); err != nil {
		fmt.Printf("CreateTablet: %v\n", err)
	}

	// Replica tablet in the second cell, whose etcd will be killed.
	tablet = &topodatapb.Tablet{
		Alias: &topodatapb.TabletAlias{
			Cell: LocalCellName2,
			Uid:  2,
		},
		Hostname:      "localhost",
		MysqlHostname: "localhost",
		PortMap: map[string]int32{
			"vt": 3334,
		},
		Shard:    "0",
		Tags:     map[string]string{"tag": "value"},
		Keyspace: "test_keyspace",
		Type:     topodatapb.TabletType_REPLICA,
	}
	if err := ts.CreateTablet(ctx, tablet); err != nil {
		fmt.Printf("CreateTablet: %v\n", err)
	}

	// First run: both cells up, both tablets expected in the log.
	// Note: %v for err (was %s), which printed "%!s(<nil>)" on success.
	err = ExecVtctl(ctx, logger, listener.Addr().String(), []string{"ListShardTablets", "test_keyspace/0"})
	fmt.Printf(
		"%s\n%s\n%v\n%s\n",
		"=======================vtctlclient==============ListShardTablets=======================================\n",
		logger.String(),
		err,
		"=======================================================================================================\n",
	)
	logger.Clear()

	// Second run: remote cell topology is down; the bug is that nothing is
	// returned instead of at least the surviving cell's tablet.
	stop2()
	fmt.Println("Stopped etcd")
	err = ExecVtctl(ctx, logger, listener.Addr().String(), []string{"ListShardTablets", "test_keyspace/0"})
	fmt.Printf(
		"%s\n%s\n%v\n%s\n",
		"=======================vtctlclient==============ListShardTablets==========when down====================\n",
		logger.String(),
		err,
		"=======================================================================================================\n",
	)
	ts.Close()
}

// TestEtcd2TopoTwo reproduces the same scenario through the vtctl binary
// code path (wrangler + vtctl.RunCommand) instead of the gRPC endpoint:
// here the surviving cell's tablet IS returned after the remote cell's
// etcd goes down, demonstrating the inconsistency with the gRPC API.
func TestEtcd2TopoTwo(t *testing.T) {
	logger := logutil.NewMemoryLogger()

	ctx := context.Background()

	cmd, dataDir, clientAddr := startEtcd2(t, 100)
	defer func() {
		if err := cmd.Process.Kill(); err != nil {
			log.Errorf("cmd.Process.Kill() failed : %v", err)
		}
		// Reap the process; an error after Kill is expected, just log it.
		if err := cmd.Wait(); err != nil {
			log.Errorf("cmd.wait() failed : %v", err)
		}
		os.RemoveAll(dataDir)
	}()

	cmd2, dataDir2, clientAddr2 := startEtcd2(t, 200)
	// stop2 kills the second cell's etcd mid-test to simulate an
	// unreachable remote cell topology.
	stop2 := func() {
		if err := cmd2.Process.Kill(); err != nil {
			log.Errorf("cmd.Process.Kill() failed : %v", err)
		}
		// Reap the process; an error after Kill is expected, just log it.
		if err := cmd2.Wait(); err != nil {
			log.Errorf("cmd.wait() failed : %v", err)
		}
		os.RemoveAll(dataDir2)
	}

	// Give both etcd daemons time to settle before using them.
	time.Sleep(5 * time.Second)

	LocalCellName := "testas1"
	LocalCellName2 := "testas2"

	testRoot := "/test-9"

	// ---------------------------------------------
	// Create the server on the new root.
	ts, err := topo.OpenServer("etcd2", clientAddr, topo.GlobalCell)
	if err != nil {
		t.Fatalf("OpenServer() failed: %v", err)
	}
	// ---------------------------------------------
	// Create the CellInfo for each cell. Failures are printed rather than
	// fatal so the test can be re-run against etcds that already hold data.
	if err := ts.CreateCellInfo(context.Background(), LocalCellName, &topodatapb.CellInfo{
		ServerAddress: clientAddr,
		Root:          path.Join(testRoot, LocalCellName),
	}); err != nil {
		fmt.Printf("CreateCellInfo() failed: %v\n", err)
	}

	if err := ts.CreateCellInfo(context.Background(), LocalCellName2, &topodatapb.CellInfo{
		ServerAddress: clientAddr2,
		Root:          path.Join(testRoot, LocalCellName2),
	}); err != nil {
		fmt.Printf("CreateCellInfo() failed: %v\n", err)
	}

	if err := ts.CreateKeyspace(ctx, "test_keyspace", &topodatapb.Keyspace{}); err != nil {
		fmt.Printf("CreateKeyspace: %v\n", err)
	}

	if err := ts.CreateShard(ctx, "test_keyspace", "0"); err != nil {
		fmt.Printf("CreateShard: %v\n", err)
	}

	// Primary tablet in the first (surviving) cell.
	tablet := &topodatapb.Tablet{
		Alias: &topodatapb.TabletAlias{
			Cell: LocalCellName,
			Uid:  1,
		},
		Hostname:      "localhost",
		MysqlHostname: "localhost",
		PortMap: map[string]int32{
			"vt": 3333,
		},
		Shard:    "0",
		Tags:     map[string]string{"tag": "value"},
		Keyspace: "test_keyspace",
		Type:     topodatapb.TabletType_MASTER,
	}
	if err := ts.CreateTablet(ctx, tablet); err != nil {
		fmt.Printf("CreateTablet: %v\n", err)
	}

	// Replica tablet in the second cell, whose etcd will be killed.
	tablet = &topodatapb.Tablet{
		Alias: &topodatapb.TabletAlias{
			Cell: LocalCellName2,
			Uid:  2,
		},
		Hostname:      "localhost",
		MysqlHostname: "localhost",
		PortMap: map[string]int32{
			"vt": 3334,
		},
		Shard:    "0",
		Tags:     map[string]string{"tag": "value"},
		Keyspace: "test_keyspace",
		Type:     topodatapb.TabletType_REPLICA,
	}
	if err := ts.CreateTablet(ctx, tablet); err != nil {
		fmt.Printf("CreateTablet: %v\n", err)
	}

	// Reopen the topo server on a fresh connection; close the previous
	// handle first so it isn't leaked.
	ts.Close()
	ts, err = topo.OpenServer("etcd2", clientAddr, topo.GlobalCell)
	if err != nil {
		t.Fatalf("OpenServer() failed: %v", err)
	}
	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Hour)
	wr := wrangler.New(logger, ts, tmclient.NewTabletManagerClient())
	// First run: both cells up, both tablets expected in the log.
	if err := vtctl.RunCommand(ctx, wr, []string{"ListShardTablets", "test_keyspace/0"}); err != nil {
		fmt.Printf("RunCommand failed: %v\n", err)
	}
	cancel()
	fmt.Printf(
		"%s\n%s\n%s\n",
		"=======================vtctl====================ListShardTablets=======================================\n",
		logger.String(),
		"=======================================================================================================\n",
	)
	logger.Clear()

	// Second run: remote cell topology is down; vtctl still returns the
	// surviving cell's tablet (unlike the gRPC path in TestEtcd2TopoOne).
	stop2()
	fmt.Println("stopped etcd 2")
	ts.Close()
	ts, err = topo.OpenServer("etcd2", clientAddr, topo.GlobalCell)
	if err != nil {
		t.Fatalf("OpenServer() failed: %v", err)
	}
	ctx, cancel = context.WithTimeout(context.Background(), 1*time.Hour)
	wr = wrangler.New(logger, ts, tmclient.NewTabletManagerClient())
	if err := vtctl.RunCommand(ctx, wr, []string{"ListShardTablets", "test_keyspace/0"}); err != nil {
		fmt.Printf("RunCommand failed: %v\n", err)
	}
	cancel()
	fmt.Printf(
		"%s\n%s\n%s\n",
		"=======================vtctl====================ListShardTablets============when down==================\n",
		logger.String(),
		"=======================================================================================================\n",
	)
	ts.Close()
}

// ExecVtctl runs a single vtctl command against the vtctld gRPC endpoint at
// hostport, streaming every response event into logger. It returns an error
// when the connection, the command dispatch, or the stream fails, and nil
// when the stream drains cleanly.
func ExecVtctl(ctx context.Context, logger logutil.Logger, hostport string, args []string) error {
	client, err := vtctlclient.New(hostport)
	if err != nil {
		// Previously this error was swallowed (logged and nil returned),
		// making a failed connection indistinguishable from a successful
		// run with empty output. Surface it to the caller instead.
		log.Warning(err)
		return fmt.Errorf("execvtctl cannot connect to host %v: %v", hostport, err)
	}
	defer client.Close()

	stream, err := client.ExecuteVtctlCommand(ctx, args, 2*time.Second)
	if err != nil {
		return fmt.Errorf("execvtctl cannot execute remote command: %v on host %v", err, hostport)
	}

	for {
		e, err := stream.Recv()
		switch err {
		case nil:
			logger.Infof("Exec %v result: %s", args, e.Value)
		case io.EOF:
			// Stream drained cleanly: the command completed.
			return nil
		default:
			return fmt.Errorf("remote error: %v", err)
		}
	}
}
Unit test results
# TestEtcd2TopoOne
=======================vtctlclient==============ListShardTablets=======================================

I0817 07:59:53.478326 api2_test.go:378] Exec [ListShardTablets test_keyspace/0] result: testas1-0000000001 test_keyspace 0 primary localhost:3333 localhost:0 [tag: "value"] <null>

I0817 07:59:53.478344 api2_test.go:378] Exec [ListShardTablets test_keyspace/0] result: testas2-0000000002 test_keyspace 0 replica localhost:3334 localhost:0 [tag: "value"] <null>


%!s(<nil>)
=======================================================================================================
....
Stopped etcd
......
{"level":"warn","ts":"2023-08-17T11:00:08.526855+0300","logger":"etcd-client","caller":"[email protected]/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"etcd-endpoints://0x14000208540/localhost:6900","attempt":0,"error":"rpc error: code = DeadlineExceeded desc = latest balancer error: last connection error: connection error: desc = \"transport: Error while dialing: dial tcp [::1]:6900: connect: connection refused\""}
W0817 11:00:08.527289   92767 shard.go:631] FindAllTabletAliasesInShard(test_keyspace,0): got partial result: GetShardReplication(testas2, test_keyspace, 0) failed.: deadline exceeded: /test-9/testas2/keyspaces/test_keyspace/shards/0/ShardReplication
{"level":"warn","ts":"2023-08-17T11:00:08.527443+0300","logger":"etcd-client","caller":"[email protected]/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"etcd-endpoints://0x140000f3a40/localhost:6800","attempt":0,"error":"rpc error: code = DeadlineExceeded desc = context deadline exceeded"}
E0817 11:00:08.527501   92767 tablet.go:233] Unable to get connection for cell testas1
W0817 11:00:08.527512   92767 tablet.go:531] cell:"testas1" uid:1: deadline exceeded: global/cells/testas1/CellInfo
W0817 11:00:08.527659   92767 server.go:1778] GetTablets encountered non-fatal error GetTabletMapForShard(test_keyspace, 0) failed: partial result: ; continuing because Strict=false


=======================vtctlclient==============ListShardTablets==========when down====================


%!s(<nil>)
=======================================================================================================



# TestEtcd2TopoTwo
=======================vtctl====================ListShardTablets=======================================

testas1-0000000001 test_keyspace 0 primary localhost:3333 localhost:0 [tag: "value"] <null>

testas2-0000000002 test_keyspace 0 replica localhost:3334 localhost:0 [tag: "value"] <null>


=======================================================================================================
......
stopped etcd 2
......
W0817 11:00:20.664647   92767 shard.go:631] FindAllTabletAliasesInShard(test_keyspace,0): got partial result: GetShardReplication(testas2, test_keyspace, 0) failed.: failed to create topo connection to http://localhost:6900, /test-9/testas2: context deadline exceeded
W0817 11:00:20.665635   92767 server.go:1778] GetTablets encountered non-fatal error GetTabletMapForShard(test_keyspace, 0) failed: partial result: 0; continuing because Strict=false
=======================vtctl====================ListShardTablets============when down==================

testas1-0000000001 test_keyspace 0 primary localhost:3333 localhost:0 [tag: "value"] <null>


=======================================================================================================

.....
E0817 11:00:20.708992   92767 tablet.go:233] Unable to get connection for cell nonexistent
E0817 11:00:20.723369   92767 api.go:120] HTTP error on /api/vtctl/: uncaught panic: this command panics on purpose, request: &http.Request{Method:"POST", URL:(*url.URL)(0x140006a7ef0), Proto:"HTTP/1.1", ProtoMajor:1, ProtoMinor:1, Header:http.Header{"Accept-Encoding":[]string{"gzip"}, "Content-Length":[]string{"9"}, "Content-Type":[]string{"application/json"}, "User-Agent":[]string{"Go-http-client/1.1"}}, Body:(*http.body)(0x14001205980), GetBody:(func() (io.ReadCloser, error))(nil), ContentLength:9, TransferEncoding:[]string(nil), Close:false, Host:"127.0.0.1:55464", Form:url.Values(nil), PostForm:url.Values(nil), MultipartForm:(*multipart.Form)(nil), Trailer:http.Header(nil), RemoteAddr:"127.0.0.1:55465", RequestURI:"/api/vtctl/", TLS:(*tls.ConnectionState)(nil), Cancel:(<-chan struct {})(nil), Response:(*http.Response)(nil), ctx:(*context.cancelCtx)(0x1400121e690)}
Binary Version
This can be reproduced on `v11` and on current master branch (a30b19e28b2aea7a98a2909a4d1a73b822251e66)

Operating System and Environment details

centos/mac/rocky

Log Fragments

No response

@DeathBorn DeathBorn added Needs Triage This issue needs to be correctly labelled and triaged Type: Bug labels Aug 17, 2023
@rohit-nayak-ps rohit-nayak-ps added Component: Cluster management and removed Needs Triage This issue needs to be correctly labelled and triaged labels Aug 21, 2023
@DeathBorn
Copy link
Contributor Author

DeathBorn commented Apr 24, 2024

https://github.com/vitessio/vitess/blob/v19.0.3/go/vt/topo/tablet.go#L553
when the default 30s context timeout is reached, this error changes from partial result to context deadline exceeded even for a successful tabletInfo result.
When ctx was changed to a basic context.Background, I received partial results

If the endpoint does not have a cached etcd connection, then a timeout of 5s is used to connect to etcd and fail early, and then the endpoint returns partial results

@deepthi
Copy link
Member

deepthi commented Apr 24, 2024

Can you enumerate which commands exactly you used to test this on v19? The old List...Tablets commands have been replaced by GetTablets.
I ask because since v13, we have used prefix-based range scans to get all the tablets for a cell in 1 topo call at least for ListAllTablets, and we don't go through GetTabletMap -> GetTablet at all except in the rare case where the grpc message size becomes too large if we try to fetch all the tablets in one call.
You could pick that change into your fork to solve your current problem, but I would also like to know if some variants of GetTablets are still vulnerable to this issue in supported releases.

@DeathBorn
Copy link
Contributor Author

Just slightly modified tests provided in reproduction steps are used. The code is collapsed in issue definition.

// Service Vtctl allows you to call vt commands through gRPC.
service Vtctl {
  rpc ExecuteVtctlCommand (vtctldata.ExecuteVtctlCommandRequest) returns (stream vtctldata.ExecuteVtctlCommandResponse) {};
}

What is basically happening in test:

  • bootstrap 2 etcds for 2 cells
  • bootstrap single vtctld using first etcd as global
  • configure 2 cells
  • create unsharded keyspace with 1 tablet per cell
  • use ListShardTablets via ExecuteVtctlCommand -> we get all tablets
  • shutdown second etcd
  • use ListShardTablets via ExecuteVtctlCommand -> context deadline after 30s timeout with empty results

Some of our apps (rails/go) use Vtctld GRPC endpoint to discover cluster metadata. In case cell is down, this command fails and returns empty results.

I don't see how GetTablets can be used from outside.

@deepthi
Copy link
Member

deepthi commented Apr 25, 2024

My question was specifically about v19. Because you linked to code from that version. I thought that you must have tested that version. Docs for GetTablets:
https://vitess.io/docs/19.0/reference/programs/vtctldclient/vtctldclient_gettablets/
EDIT: I think we should look into this anyway, so thanks for creating the issue.

@mattlord
Copy link
Contributor

mattlord commented May 1, 2024

@DeathBorn to be clear, vtctl and vtctlclient have both been deprecated since v18 and we've been talking about this since v14. vtctldclient uses the vtctlservice vtctld RPCs that you can use directly. The specific one you mentioned, ExecuteVtctlCommand should not be used. That was a bridge until all of the client commands had been implemented in vtctldclient. The GetTablets RPC can be used directly against a vtctld server. You can see how vtctldclient uses it here: https://github.com/vitessio/vitess/blob/release-19.0/go/cmd/vtctldclient/command/tablets.go

Now, having said all of that... the behavior you noted is an issue that exists with the current client code and RPCs on main/v20 today. And I agree that it's not ideal. But I also don't think that returning a partial result without VERY clear indications that it is indeed a partial result is good (and it's arguably a good idea to require the client to explicitly request/accept partial results). So the partial results — certainly at least when explicitly asked for — along with some kind of warning makes sense to me.

Did you have anything specific in mind? Using vtctldclient GetTablets is an easy test because it would be the same when using the RPC directly.

@mattlord
Copy link
Contributor

mattlord commented May 2, 2024

In looking at this:

It seems to me that by default (--strict=false) we should already be showing partial results. That would make this more obviously a bug in the new client if in fact we're not.

In looking at the server side code, we should indeed already log a warning and return partial results by default:

// GetTablets is part of the vtctlservicepb.VtctldServer interface.
func (s *VtctldServer) GetTablets(ctx context.Context, req *vtctldatapb.GetTabletsRequest) (resp *vtctldatapb.GetTabletsResponse, err error) {
span, ctx := trace.NewSpan(ctx, "VtctldServer.GetTablets")
defer span.Finish()
defer panicHandler(&err)
span.Annotate("cells", strings.Join(req.Cells, ","))
if req.Keyspace != "" {
span.Annotate("keyspace", req.Keyspace)
}
if req.TabletType != topodatapb.TabletType_UNKNOWN {
span.Annotate("tablet_type", topodatapb.TabletType_name[int32(req.TabletType)])
}
span.Annotate("strict", req.Strict)
// It is possible that an old primary has not yet updated its type in the
// topo. In that case, report its type as UNKNOWN. It used to be PRIMARY but
// is no longer the serving primary.
adjustTypeForStalePrimary := func(ti *topo.TabletInfo, mtst time.Time) {
if ti.Type == topodatapb.TabletType_PRIMARY && ti.GetPrimaryTermStartTime().Before(mtst) {
ti.Tablet.Type = topodatapb.TabletType_UNKNOWN
}
}
// Create a context for our per-cell RPCs, with a timeout upper-bounded at
// the RemoteOperationTimeout.
//
// Per-cell goroutines may also cancel this context if they fail and the
// request specified Strict=true to allow us to fail faster.
ctx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
defer cancel()
var tabletMap map[string]*topo.TabletInfo
switch {
case len(req.TabletAliases) > 0:
span.Annotate("tablet_aliases", strings.Join(topoproto.TabletAliasList(req.TabletAliases).ToStringSlice(), ","))
tabletMap, err = s.ts.GetTabletMap(ctx, req.TabletAliases, nil)
if err != nil {
err = fmt.Errorf("GetTabletMap(%v) failed: %w", req.TabletAliases, err)
}
case req.Keyspace != "" && req.Shard != "":
span.Annotate("shard", req.Shard)
tabletMap, err = s.ts.GetTabletMapForShard(ctx, req.Keyspace, req.Shard)
if err != nil {
err = fmt.Errorf("GetTabletMapForShard(%s, %s) failed: %w", req.Keyspace, req.Shard, err)
}
default:
// goto the req.Cells branch
tabletMap = nil
}
if err != nil {
switch {
case topo.IsErrType(err, topo.PartialResult):
if req.Strict {
return nil, err
}
log.Warningf("GetTablets encountered non-fatal error %s; continuing because Strict=false", err)
default:
return nil, err
}
}
if tabletMap != nil {
var truePrimaryTimestamp time.Time
for _, ti := range tabletMap {
if ti.Type == topodatapb.TabletType_PRIMARY {
primaryTimestamp := ti.GetPrimaryTermStartTime()
if primaryTimestamp.After(truePrimaryTimestamp) {
truePrimaryTimestamp = primaryTimestamp
}
}
}
tablets := make([]*topodatapb.Tablet, 0, len(tabletMap))
for _, ti := range tabletMap {
if req.TabletType != topodatapb.TabletType_UNKNOWN && ti.Type != req.TabletType {
continue
}
adjustTypeForStalePrimary(ti, truePrimaryTimestamp)
tablets = append(tablets, ti.Tablet)
}
return &vtctldatapb.GetTabletsResponse{Tablets: tablets}, nil
}
cells := req.Cells
if len(cells) == 0 {
var c []string
c, err = s.ts.GetKnownCells(ctx)
if err != nil {
return nil, err
}
cells = c
}
var (
m sync.Mutex
wg sync.WaitGroup
rec concurrency.AllErrorRecorder
allTablets []*topo.TabletInfo
)
for _, cell := range cells {
wg.Add(1)
go func(cell string) {
defer wg.Done()
tablets, err := s.ts.GetTabletsByCell(ctx, cell, nil)
if err != nil {
if req.Strict {
log.Infof("GetTablets got an error from cell %s: %s. Running in strict mode, so canceling other cell RPCs", cell, err)
cancel()
}
rec.RecordError(fmt.Errorf("GetTabletsByCell(%s) failed: %w", cell, err))
return
}
m.Lock()
defer m.Unlock()
allTablets = append(allTablets, tablets...)
}(cell)
}
wg.Wait()
if rec.HasErrors() {
if req.Strict || len(rec.Errors) == len(cells) {
err = rec.Error()
return nil, err
}
}
// Collect true primary term start times, and optionally filter out any
// tablets by keyspace according to the request.
PrimaryTermStartTimes := map[string]time.Time{}
filteredTablets := make([]*topo.TabletInfo, 0, len(allTablets))
for _, tablet := range allTablets {
if req.Keyspace != "" && tablet.Keyspace != req.Keyspace {
continue
}
if req.TabletType != topodatapb.TabletType_UNKNOWN && tablet.Type != req.TabletType {
continue
}
key := tablet.Keyspace + "." + tablet.Shard
if v, ok := PrimaryTermStartTimes[key]; ok {
if tablet.GetPrimaryTermStartTime().After(v) {
PrimaryTermStartTimes[key] = tablet.GetPrimaryTermStartTime()
}
} else {
PrimaryTermStartTimes[key] = tablet.GetPrimaryTermStartTime()
}
filteredTablets = append(filteredTablets, tablet)
}
adjustedTablets := make([]*topodatapb.Tablet, len(filteredTablets))
// collect the tablets with adjusted primary term start times. they've
// already been filtered by the above loop, so no keyspace filtering
// here.
for i, ti := range filteredTablets {
key := ti.Keyspace + "." + ti.Shard
adjustTypeForStalePrimary(ti, PrimaryTermStartTimes[key])
adjustedTablets[i] = ti.Tablet
}
return &vtctldatapb.GetTabletsResponse{
Tablets: adjustedTablets,
}, nil
}

So I'll have to work on a test that is based on vtctldclient and VtctldServer if there isn't already one.

@mattlord
Copy link
Contributor

mattlord commented May 2, 2024

So far, the new tests have confirmed that we do actually show partial results by default: #15829

I'll add some additional ones but assuming they all confirm that we're already doing what you want in v14+, I think that we should close this as fixed when that PR with new tests is merged. Please let me know if you feel that I'm missing or misunderstanding anything @DeathBorn .

@mattlord
Copy link
Contributor

mattlord commented May 2, 2024

@DeathBorn a related aside/FYI, in v19+ the "new vtctl" is vtctldclient --server internal: #14315

https://vitess.io/docs/19.0/reference/programs/vtctldclient/#synopsis

You'll then have a vtctldclient process that uses a bundled/internal VtctldServer implementation and no external vtctld server is needed.

@DeathBorn
Copy link
Contributor Author

@mattlord We have some non golang services which depend on vtctld grpc api to perform some actions. Vtctld does not expose the "new" api. Do you have plans to change it ?

@mattlord
Copy link
Contributor

mattlord commented May 2, 2024

@DeathBorn I don't think I understand. Perhaps you're not including it in the --service_map flag for the vtctld instance? Like this:

vtctld ... --service_map 'grpc-vtctl,grpc-vtctld' ...

@mattlord
Copy link
Contributor

mattlord commented May 2, 2024

@DeathBorn Here's an example usage:

❯ grpcurl -plaintext -format json -import-path ${VTROOT}/proto -proto vtctlservice.proto \
  -d '{"keyspace":"commerce", "shard":"0", "tablet_type":"PRIMARY"}' \
  localhost:15999 vtctlservice.Vtctld/GetTablets
{
  "tablets": [
    {
      "alias": {
        "cell": "zone1",
        "uid": 100
      },
      "hostname": "localhost",
      "portMap": {
        "grpc": 16100,
        "vt": 15100
      },
      "keyspace": "commerce",
      "shard": "0",
      "type": "PRIMARY",
      "mysqlHostname": "localhost",
      "mysqlPort": 17100,
      "primaryTermStartTime": {
        "seconds": "1714650530",
        "nanoseconds": 641981000
      },
      "defaultConnCollation": 255
    }
  ]
}

grpcurl also happens to be written in go though: https://github.com/fullstorydev/grpcurl

I'm happy to make some changes related to this if they are needed, but I'm not sure what the issue may be.

@DeathBorn
Copy link
Contributor Author

@DeathBorn I don't think I understand. Perhaps you're not including it in the --service_map flag for the vtctld instance? Like this:

vtctld ... --service_map 'grpc-vtctl,grpc-vtctld' ...

🤦 exactly this.... I have this grpc-vtctl
Thanks a lot!

We can close with additional tests added. 🙇

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging a pull request may close this issue.

4 participants