From cc61316e84a2142a1ad424fac9c9edac2a674c0c Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Tue, 10 Jan 2023 16:09:14 +0530 Subject: [PATCH 1/9] VTOrc running PRS when database_instance empty bug fix. (#12019) * feat: convert join with database_instance to a left join and prevent fixes from running if the information from database_instance is unavailable Signed-off-by: Manan Gupta * test: add tests to verify the fix works Signed-off-by: Manan Gupta Signed-off-by: Manan Gupta --- go/vt/vtorc/db/db.go | 9 ++ go/vt/vtorc/inst/analysis_dao.go | 7 +- go/vt/vtorc/inst/analysis_dao_test.go | 140 +++++++++++++++++++++++++- go/vt/vtorc/test/recovery_analysis.go | 2 + 4 files changed, 156 insertions(+), 2 deletions(-) diff --git a/go/vt/vtorc/db/db.go b/go/vt/vtorc/db/db.go index d3cf83b871d..2d11f7f2332 100644 --- a/go/vt/vtorc/db/db.go +++ b/go/vt/vtorc/db/db.go @@ -154,6 +154,15 @@ func deployStatements(db *sql.DB, queries []string) error { return nil } +// ClearVTOrcDatabase is used to clear the VTOrc database. This function is meant to be used by tests to clear the +// database to get a clean slate without starting a new one. +func ClearVTOrcDatabase() { + db, _, _ := sqlutils.GetSQLiteDB(config.Config.SQLite3DataFile) + if db != nil { + _ = initVTOrcDB(db) + } +} + // initVTOrcDB attempts to create/upgrade the vtorc backend database. It is created once in the // application's lifetime. 
func initVTOrcDB(db *sql.DB) error { diff --git a/go/vt/vtorc/inst/analysis_dao.go b/go/vt/vtorc/inst/analysis_dao.go index 638d9885f10..3a8d13991a0 100644 --- a/go/vt/vtorc/inst/analysis_dao.go +++ b/go/vt/vtorc/inst/analysis_dao.go @@ -79,6 +79,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) vitess_keyspace.keyspace_type AS keyspace_type, vitess_keyspace.durability_policy AS durability_policy, primary_instance.read_only AS read_only, + MIN(primary_instance.hostname) IS NULL AS is_invalid, MIN(primary_instance.data_center) AS data_center, MIN(primary_instance.region) AS region, MIN(primary_instance.physical_environment) AS physical_environment, @@ -294,7 +295,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) JOIN vitess_keyspace ON ( vitess_tablet.keyspace = vitess_keyspace.keyspace ) - JOIN database_instance primary_instance ON ( + LEFT JOIN database_instance primary_instance ON ( vitess_tablet.hostname = primary_instance.hostname AND vitess_tablet.port = primary_instance.port ) @@ -475,6 +476,10 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) // We failed to load the durability policy, so we shouldn't run any analysis return nil } + isInvalid := m.GetBool("is_invalid") + if isInvalid { + return nil + } if a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas == 0 { a.Analysis = DeadPrimaryWithoutReplicas a.Description = "Primary cannot be reached by vtorc and has no replica" diff --git a/go/vt/vtorc/inst/analysis_dao_test.go b/go/vt/vtorc/inst/analysis_dao_test.go index 190c9e0a749..1ee0d2a5ba5 100644 --- a/go/vt/vtorc/inst/analysis_dao_test.go +++ b/go/vt/vtorc/inst/analysis_dao_test.go @@ -28,7 +28,10 @@ import ( "vitess.io/vitess/go/vt/vtorc/test" ) -func TestGetReplicationAnalysis(t *testing.T) { +// TestGetReplicationAnalysisDecision tests the code of GetReplicationAnalysis decision-making. It doesn't check the SQL query +// run by it. 
It only checks the analysis part after the rows have been read. This test fakes the db and explicitly returns the +// rows that are specified in the test. +func TestGetReplicationAnalysisDecision(t *testing.T) { tests := []struct { name string info []*test.InfoForRecoveryAnalysis @@ -519,10 +522,54 @@ func TestGetReplicationAnalysis(t *testing.T) { keyspaceWanted: "ks", shardWanted: "0", codeWanted: NoProblem, + }, { + // If the database_instance table for a tablet is empty (discovery of MySQL information hasn't happened yet or failed) + // then we shouldn't run a failure fix on it until the discovery succeeds + name: "Empty database_instance table", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6708, + }, + DurabilityPolicy: "semi_sync", + LastCheckValid: 1, + CountReplicas: 4, + CountValidReplicas: 4, + CountValidReplicatingReplicas: 3, + CountValidOracleGTIDReplicas: 4, + CountLoggingReplicas: 2, + IsPrimary: 1, + SemiSyncPrimaryEnabled: 1, + }, { + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_REPLICA, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + IsInvalid: 1, + DurabilityPolicy: "semi_sync", + }}, + keyspaceWanted: "ks", + shardWanted: "0", + codeWanted: NoProblem, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + oldDB := db.Db + defer func() { + db.Db = oldDB + }() + var rowMaps []sqlutils.RowMap for _, analysis := range tt.info { analysis.SetValuesFromTabletInfo() @@ -547,3 +594,94 @@ func TestGetReplicationAnalysis(t *testing.T) { }) } } + +// TestGetReplicationAnalysis tests the entire GetReplicationAnalysis. It inserts data into the database and runs the function. 
+// The database is not faked. This is intended to give more test coverage. This test is more comprehensive but more expensive than TestGetReplicationAnalysisDecision. +// This test is somewhere between a unit test, and an end-to-end test. It is specifically useful for testing situations which are hard to come by in end-to-end test, but require +// real-world data to test specifically. +func TestGetReplicationAnalysis(t *testing.T) { + // The initialSQL is a set of insert commands copied from a dump of an actual running VTOrc instances. The relevant insert commands are here. + // This is a dump taken from a test running 4 tablets, zone1-101 is the primary, zone1-100 is a replica, zone1-112 is a rdonly and zone2-200 is a cross-cell replica. + initialSQL := []string{ + `INSERT INTO database_instance VALUES('localhost',6747,'2022-12-28 07:26:04','2022-12-28 07:26:04',213696377,'8.0.31','ROW',1,1,'vt-0000000112-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000112-relay-bin.000002',15815,0,1,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a5138-8680-11ed-9240-92a06c3be3c2','2022-12-28 07:26:04','',1,0,0,'zone1-0000000112','Homebrew','8.0','FULL',10816929,0,0,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a5138-8680-11ed-9240-92a06c3be3c2',1,1,'',1000000000000000000,1,0,0,0,'',0,'','','[]','',0);`, + `INSERT INTO database_instance VALUES('localhost',6711,'2022-12-28 07:26:04','2022-12-28 07:26:04',1094500338,'8.0.31','ROW',1,1,'vt-0000000100-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000100-relay-bin.000002',15815,0,1,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a5138-8680-11ed-acf8-d6b0ef9f4eaa','2022-12-28 
07:26:04','',1,0,0,'zone1-0000000100','Homebrew','8.0','FULL',10103920,0,1,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a5138-8680-11ed-acf8-d6b0ef9f4eaa',1,1,'',1000000000000000000,1,0,1,0,'',0,'','','[]','',0);`, + `INSERT INTO database_instance VALUES('localhost',6714,'2022-12-28 07:26:04','2022-12-28 07:26:04',390954723,'8.0.31','ROW',1,1,'vt-0000000101-bin.000001',15583,'',0,0,0,'',0,'',0,NULL,NULL,0,'','',0,0,'',0,0,0,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a4cc4-8680-11ed-a104-47706090afbd','2022-12-28 07:26:04','',0,0,0,'zone1-0000000101','Homebrew','8.0','FULL',11366095,1,1,'ON',1,'','','729a4cc4-8680-11ed-a104-47706090afbd',-1,-1,'',1000000000000000000,1,1,0,2,'',0,'','','[]','',0);`, + `INSERT INTO database_instance VALUES('localhost',6756,'2022-12-28 07:26:05','2022-12-28 07:26:05',444286571,'8.0.31','ROW',1,1,'vt-0000000200-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000200-relay-bin.000002',15815,0,1,0,'zone2','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a497c-8680-11ed-8ad4-3f51d747db75','2022-12-28 07:26:05','',1,0,0,'zone2-0000000200','Homebrew','8.0','FULL',10443112,0,1,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a497c-8680-11ed-8ad4-3f51d747db75',1,1,'',1000000000000000000,1,0,1,0,'',0,'','','[]','',0);`, + `INSERT INTO vitess_tablet VALUES('localhost',6711,'ks','0','zone1',2,'0001-01-01 
00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3130307d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363731307d20706f72745f6d61703a7b6b65793a227674222076616c75653a363730397d206b657973706163653a226b73222073686172643a22302220747970653a5245504c494341206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363731312064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, + `INSERT INTO vitess_tablet VALUES('localhost',6714,'ks','0','zone1',1,'2022-12-28 07:23:25.129898+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3130317d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363731337d20706f72745f6d61703a7b6b65793a227674222076616c75653a363731327d206b657973706163653a226b73222073686172643a22302220747970653a5052494d415259206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a36373134207072696d6172795f7465726d5f73746172745f74696d653a7b7365636f6e64733a31363732323132323035206e616e6f7365636f6e64733a3132393839383030307d2064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, + `INSERT INTO vitess_tablet VALUES('localhost',6747,'ks','0','zone1',3,'0001-01-01 00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3131327d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363734367d20706f72745f6d61703a7b6b65793a227674222076616c75653a363734357d206b657973706163653a226b73222073686172643a22302220747970653a52444f4e4c59206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363734372064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, + `INSERT INTO vitess_tablet VALUES('localhost',6756,'ks','0','zone2',2,'0001-01-01 
00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653222207569643a3230307d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363735357d20706f72745f6d61703a7b6b65793a227674222076616c75653a363735347d206b657973706163653a226b73222073686172643a22302220747970653a5245504c494341206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363735362064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, + `INSERT INTO vitess_keyspace VALUES('ks',0,'semi_sync');`, + } + + // The test is intended to be used as follows. The initial data is stored into the database. Following this, some specific queries are run that each individual test specifies to get the desired state. + tests := []struct { + name string + sql []string + codeWanted AnalysisCode + shardWanted string + keyspaceWanted string + }{ + { + name: "No additions", + sql: nil, + codeWanted: NoProblem, + }, { + name: "Removing Primary Tablet's Vitess record", + sql: []string{ + // This query removes the primary tablet's vitess_tablet record + `delete from vitess_tablet where port = 6714`, + }, + codeWanted: ClusterHasNoPrimary, + keyspaceWanted: "ks", + shardWanted: "0", + }, { + name: "Removing Primary Tablet's MySQL record", + sql: []string{ + // This query removes the primary tablet's database_instance record + `delete from database_instance where port = 6714`, + }, + // As long as we have the vitess record stating that this tablet is the primary + // It would be incorrect to run a PRS. + // This situation only happens when we haven't been able to read the MySQL information even once for this tablet. + // So it is likely a new tablet. 
+ codeWanted: NoProblem, + }, { + name: "Removing Replica Tablet's MySQL record", + sql: []string{ + // This query removes the replica tablet's database_instance record + `delete from database_instance where port = 6711`, + }, + // As long as we don't have the MySQL information, we shouldn't do anything. + // We should wait for the MySQL information to be refreshed once. + // This situation only happens when we haven't been able to read the MySQL information even once for this tablet. + // So it is likely a new tablet. + codeWanted: NoProblem, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Each test should clear the database. The easiest way to do that is to run all the initialization commands again + defer func() { + db.ClearVTOrcDatabase() + }() + + for _, query := range append(initialSQL, tt.sql...) { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + got, err := GetReplicationAnalysis("", "", &ReplicationAnalysisHints{}) + require.NoError(t, err) + if tt.codeWanted == NoProblem { + require.Len(t, got, 0) + return + } + require.Len(t, got, 1) + require.Equal(t, tt.codeWanted, got[0].Analysis) + require.Equal(t, tt.keyspaceWanted, got[0].AnalyzedKeyspace) + require.Equal(t, tt.shardWanted, got[0].AnalyzedShard) + }) + } +} diff --git a/go/vt/vtorc/test/recovery_analysis.go b/go/vt/vtorc/test/recovery_analysis.go index 07a9219f4c5..f4cdcfd7db9 100644 --- a/go/vt/vtorc/test/recovery_analysis.go +++ b/go/vt/vtorc/test/recovery_analysis.go @@ -35,6 +35,7 @@ type InfoForRecoveryAnalysis struct { Shard string KeyspaceType int DurabilityPolicy string + IsInvalid int IsPrimary int IsCoPrimary int Hostname string @@ -122,6 +123,7 @@ func (info *InfoForRecoveryAnalysis) ConvertToRowMap() sqlutils.RowMap { rowMap["is_co_primary"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsCoPrimary), Valid: true} rowMap["is_downtimed"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsDowntimed), Valid: true} 
rowMap["is_failing_to_connect_to_primary"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsFailingToConnectToPrimary), Valid: true} + rowMap["is_invalid"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsInvalid), Valid: true} rowMap["is_last_check_valid"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.LastCheckValid), Valid: true} rowMap["is_primary"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsPrimary), Valid: true} rowMap["is_stale_binlog_coordinates"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsStaleBinlogCoordinates), Valid: true} From 7f738a788e23b9297607cd2945af580a7205b5c5 Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Fri, 16 Dec 2022 14:32:02 +0530 Subject: [PATCH 2/9] Timeout Fixes and VTOrc Improvement (#11881) * refactor: move tests out of newfeaturestest so that they run on upgrade-downgrade tests too Signed-off-by: Manan Gupta * feat: add failing ers test for handling multiple vttablet failures with default values of flags Signed-off-by: Manan Gupta * feat: add a new lock-timeout flag and use that instead of remote-operation-timeout Signed-off-by: Manan Gupta * feat: augment DownPrimary test to reproduce the issue of VTOrc not handling multiple failures Signed-off-by: Manan Gupta * feat: remove LockShardTimeout configuration from VTOrc and add parallelism to refresh of tablets Signed-off-by: Manan Gupta * log: add more logging lines around ers in vtorc Signed-off-by: Manan Gupta * test: get the test to work Signed-off-by: Manan Gupta * feat: fix usage of wait for replicas timeout Signed-off-by: Manan Gupta * test: fix flags expected output Signed-off-by: Manan Gupta * test: fix race in test now that the function is called in parallel multiple times Signed-off-by: Manan Gupta * feat: fix default of onCloseTimeout to 1 second Signed-off-by: Manan Gupta * test: add failing unit test to refreshTabletsInKeyspaceShard Signed-off-by: Manan Gupta * feat: fix vtorc to not 
forget a tablet which has been deleted Signed-off-by: Manan Gupta * feat: fix backward compatibility, add tests and release notes docs Signed-off-by: Manan Gupta * test: fix flags output Signed-off-by: Manan Gupta * test: use disable-replication-manager instead of disable-active-reparents to allow vttablets to setup replication when restarted Signed-off-by: Manan Gupta * test: fix flaky test by not checking for an error Signed-off-by: Manan Gupta * feat: handle the case of empty hostname in tablet initialization Signed-off-by: Manan Gupta * feat: update onclose timeout to 10 seconds Signed-off-by: Manan Gupta * test: fix unit test Signed-off-by: Manan Gupta * feat: address review comments Signed-off-by: Manan Gupta * docs: add comments explaining the test functions Signed-off-by: Manan Gupta * feat: add summary docs for 'lock-shard-timeout' deprecation Signed-off-by: Manan Gupta Signed-off-by: Manan Gupta --- go/flags/endtoend/vtbackup.txt | 3 +- go/flags/endtoend/vtctld.txt | 5 +- go/flags/endtoend/vtgate.txt | 3 +- go/flags/endtoend/vtgr.txt | 3 +- go/flags/endtoend/vtorc.txt | 4 +- go/flags/endtoend/vttablet.txt | 3 +- go/internal/flag/flag.go | 11 ++ .../reparent/newfeaturetest/reparent_test.go | 154 +++--------------- .../reparent/plannedreparent/reparent_test.go | 146 ++++++++++++++++- .../primaryfailure/primary_failure_test.go | 12 +- go/vt/topo/locks.go | 35 ++-- go/vt/topo/locks_test.go | 85 ++++++++++ .../reparentutil/emergency_reparenter.go | 2 +- go/vt/vtctl/reparentutil/replication.go | 4 +- go/vt/vtctl/reparentutil/replication_test.go | 22 ++- go/vt/vtorc/config/config.go | 9 +- go/vt/vtorc/config/config_test.go | 15 -- go/vt/vtorc/logic/tablet_discovery.go | 32 ++-- go/vt/vtorc/logic/tablet_discovery_test.go | 31 +++- go/vt/vtorc/logic/topology_recovery.go | 5 + 20 files changed, 374 insertions(+), 210 deletions(-) create mode 100644 go/vt/topo/locks_test.go diff --git a/go/flags/endtoend/vtbackup.txt b/go/flags/endtoend/vtbackup.txt index 
a588976126d..fc587541d06 100644 --- a/go/flags/endtoend/vtbackup.txt +++ b/go/flags/endtoend/vtbackup.txt @@ -89,6 +89,7 @@ Usage of vtbackup: --keep-alive-timeout duration Wait until timeout elapses after a successful backup before shutting down. --keep_logs duration keep logs for this long (using ctime) (zero to keep forever) --keep_logs_by_mtime duration keep logs for this long (using mtime) (zero to keep forever) + --lock-timeout duration Maximum time for which a shard/keyspace lock can be acquired for (default 45s) --log_backtrace_at traceLocation when logging hits line file:N, emit a stack trace (default :0) --log_dir string If non-empty, write log files in this directory --log_err_stacks log stack traces for errors @@ -122,7 +123,7 @@ Usage of vtbackup: --port int port for the server --pprof strings enable profiling --purge_logs_interval duration how often try to remove old logs (default 1h0m0s) - --remote_operation_timeout duration time to wait for a remote operation (default 30s) + --remote_operation_timeout duration time to wait for a remote operation (default 15s) --restart_before_backup Perform a mysqld clean/full restart after applying binlogs, but before taking the backup. Only makes sense to work around xtrabackup bugs. --s3_backup_aws_endpoint string endpoint of the S3 backend (region must be provided). --s3_backup_aws_region string AWS region to use. 
(default "us-east-1") diff --git a/go/flags/endtoend/vtctld.txt b/go/flags/endtoend/vtctld.txt index 906a9160b6f..7ed6b028bb1 100644 --- a/go/flags/endtoend/vtctld.txt +++ b/go/flags/endtoend/vtctld.txt @@ -1,5 +1,5 @@ Usage of vtctld: - --action_timeout duration time to wait for an action before resorting to force (default 2m0s) + --action_timeout duration time to wait for an action before resorting to force (default 1m0s) --alsologtostderr log to standard error as well as files --azblob_backup_account_key_file string Path to a file containing the Azure Storage account key; if this flag is unset, the environment variable VT_AZBLOB_ACCOUNT_KEY will be used as the key itself (NOT a file path). --azblob_backup_account_name string Azure Storage Account name for backups; if this flag is unset, the environment variable VT_AZBLOB_ACCOUNT_NAME will be used. @@ -61,6 +61,7 @@ Usage of vtctld: --keep_logs duration keep logs for this long (using ctime) (zero to keep forever) --keep_logs_by_mtime duration keep logs for this long (using mtime) (zero to keep forever) --lameduck-period duration keep running at least this long after SIGTERM before stopping (default 50ms) + --lock-timeout duration Maximum time for which a shard/keyspace lock can be acquired for (default 45s) --log_backtrace_at traceLocation when logging hits line file:N, emit a stack trace (default :0) --log_dir string If non-empty, write log files in this directory --log_err_stacks log stack traces for errors @@ -74,7 +75,7 @@ Usage of vtctld: --pprof strings enable profiling --proxy_tablets Setting this true will make vtctld proxy the tablet status instead of redirecting to them --purge_logs_interval duration how often try to remove old logs (default 1h0m0s) - --remote_operation_timeout duration time to wait for a remote operation (default 30s) + --remote_operation_timeout duration time to wait for a remote operation (default 15s) --s3_backup_aws_endpoint string endpoint of the S3 backend (region must be 
provided). --s3_backup_aws_region string AWS region to use. (default "us-east-1") --s3_backup_aws_retries int AWS request retries. (default -1) diff --git a/go/flags/endtoend/vtgate.txt b/go/flags/endtoend/vtgate.txt index cbf04bf888c..8564f2b594a 100644 --- a/go/flags/endtoend/vtgate.txt +++ b/go/flags/endtoend/vtgate.txt @@ -69,6 +69,7 @@ Usage of vtgate: --keyspaces_to_watch strings Specifies which keyspaces this vtgate should have access to while routing queries or accessing the vschema. --lameduck-period duration keep running at least this long after SIGTERM before stopping (default 50ms) --legacy_replication_lag_algorithm Use the legacy algorithm when selecting vttablets for serving. (default true) + --lock-timeout duration Maximum time for which a shard/keyspace lock can be acquired for (default 45s) --lock_heartbeat_time duration If there is lock function used. This will keep the lock connection active by using this heartbeat (default 5s) --log_backtrace_at traceLocation when logging hits line file:N, emit a stack trace (default :0) --log_dir string If non-empty, write log files in this directory @@ -133,7 +134,7 @@ Usage of vtgate: --querylog-format string format for query logs ("text" or "json") (default "text") --querylog-row-threshold uint Number of rows a query has to return or affect before being logged; not useful for streaming queries. 0 means all queries will be logged. 
--redact-debug-ui-queries redact full queries and bind variables from debug UI - --remote_operation_timeout duration time to wait for a remote operation (default 30s) + --remote_operation_timeout duration time to wait for a remote operation (default 15s) --retry-count int retry count (default 2) --schema_change_signal Enable the schema tracker; requires queryserver-config-schema-change-signal to be enabled on the underlying vttablets for this to work (default true) --schema_change_signal_user string User to be used to send down query to vttablet to retrieve schema changes diff --git a/go/flags/endtoend/vtgr.txt b/go/flags/endtoend/vtgr.txt index 75e7b0a0fc4..9e0798f9fca 100644 --- a/go/flags/endtoend/vtgr.txt +++ b/go/flags/endtoend/vtgr.txt @@ -22,6 +22,7 @@ Usage of vtgr: -h, --help display usage and exit --keep_logs duration keep logs for this long (using ctime) (zero to keep forever) --keep_logs_by_mtime duration keep logs for this long (using mtime) (zero to keep forever) + --lock-timeout duration Maximum time for which a shard/keyspace lock can be acquired for (default 45s) --log_backtrace_at traceLocation when logging hits line file:N, emit a stack trace (default :0) --log_dir string If non-empty, write log files in this directory --log_err_stacks log stack traces for errors @@ -31,7 +32,7 @@ Usage of vtgr: --pprof strings enable profiling --purge_logs_interval duration how often try to remove old logs (default 1h0m0s) --refresh_interval duration Refresh interval to load tablets. (default 10s) - --remote_operation_timeout duration time to wait for a remote operation (default 30s) + --remote_operation_timeout duration time to wait for a remote operation (default 15s) --scan_interval duration Scan interval to diagnose and repair. (default 3s) --scan_repair_timeout duration Time to wait for a Diagnose and repair operation. 
(default 3s) --security_policy string the name of a registered security policy to use for controlling access to URLs - empty means allow all for anyone (built-in policies: deny-all, read-only) diff --git a/go/flags/endtoend/vtorc.txt b/go/flags/endtoend/vtorc.txt index 3c57dd3bddf..c445e5f61bf 100644 --- a/go/flags/endtoend/vtorc.txt +++ b/go/flags/endtoend/vtorc.txt @@ -23,7 +23,7 @@ Usage of vtorc: --keep_logs duration keep logs for this long (using ctime) (zero to keep forever) --keep_logs_by_mtime duration keep logs for this long (using mtime) (zero to keep forever) --lameduck-period duration keep running at least this long after SIGTERM before stopping (default 50ms) - --lock-shard-timeout duration Duration for which a shard lock is held when running a recovery (default 30s) + --lock-timeout duration Maximum time for which a shard/keyspace lock can be acquired for (default 45s) --log_backtrace_at traceLocation when logging hits line file:N, emit a stack trace (default :0) --log_dir string If non-empty, write log files in this directory --log_err_stacks log stack traces for errors @@ -39,7 +39,7 @@ Usage of vtorc: --reasonable-replication-lag duration Maximum replication lag on replicas which is deemed to be acceptable (default 10s) --recovery-period-block-duration duration Duration for which a new recovery is blocked on an instance after running a recovery (default 30s) --recovery-poll-duration duration Timer duration on which VTOrc polls its database to run a recovery (default 1s) - --remote_operation_timeout duration time to wait for a remote operation (default 30s) + --remote_operation_timeout duration time to wait for a remote operation (default 15s) --security_policy string the name of a registered security policy to use for controlling access to URLs - empty means allow all for anyone (built-in policies: deny-all, read-only) --shutdown_wait_time duration Maximum time to wait for VTOrc to release all the locks that it is holding before shutting down on 
SIGTERM (default 30s) --snapshot-topology-interval duration Timer duration on which VTOrc takes a snapshot of the current MySQL information it has in the database. Should be in multiple of hours diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index 09badc91a58..3b919eee67c 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -160,6 +160,7 @@ Usage of vttablet: --keep_logs duration keep logs for this long (using ctime) (zero to keep forever) --keep_logs_by_mtime duration keep logs for this long (using mtime) (zero to keep forever) --lameduck-period duration keep running at least this long after SIGTERM before stopping (default 50ms) + --lock-timeout duration Maximum time for which a shard/keyspace lock can be acquired for (default 45s) --lock_tables_timeout duration How long to keep the table locked before timing out (default 1m0s) --log_backtrace_at traceLocation when logging hits line file:N, emit a stack trace (default :0) --log_dir string If non-empty, write log files in this directory @@ -240,7 +241,7 @@ Usage of vttablet: --redact-debug-ui-queries redact full queries and bind variables from debug UI --relay_log_max_items int Maximum number of rows for VReplication target buffering. (default 5000) --relay_log_max_size int Maximum buffer size (in bytes) for VReplication target buffering. If single rows are larger than this, a single row is buffered at a time. (default 250000) - --remote_operation_timeout duration time to wait for a remote operation (default 30s) + --remote_operation_timeout duration time to wait for a remote operation (default 15s) --replication_connect_retry duration how long to wait in between replica reconnect attempts. Only precise to the second. 
(default 10s) --restore_concurrency int (init restore parameter) how many concurrent files to restore at once (default 4) --restore_from_backup (init restore parameter) will check BackupStorage for a recent backup at startup and start there diff --git a/go/internal/flag/flag.go b/go/internal/flag/flag.go index 1f55da7b4ee..e126f3c45f5 100644 --- a/go/internal/flag/flag.go +++ b/go/internal/flag/flag.go @@ -70,6 +70,17 @@ func Parse(fs *flag.FlagSet) { flag.Parse() } +// IsFlagProvided returns if the given flag has been provided by the user explicitly or not +func IsFlagProvided(name string) bool { + found := false + flag.Visit(func(f *flag.Flag) { + if f.Name == name { + found = true + } + }) + return found +} + // TrickGlog tricks glog into understanding that flags have been parsed. // // N.B. Do not delete this function. `glog` is a persnickity package and wants diff --git a/go/test/endtoend/reparent/newfeaturetest/reparent_test.go b/go/test/endtoend/reparent/newfeaturetest/reparent_test.go index 394f9f71226..1de1adf0430 100644 --- a/go/test/endtoend/reparent/newfeaturetest/reparent_test.go +++ b/go/test/endtoend/reparent/newfeaturetest/reparent_test.go @@ -19,157 +19,43 @@ package newfeaturetest import ( "context" "fmt" - "strconv" "testing" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "google.golang.org/protobuf/encoding/protojson" - - "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/test/endtoend/reparent/utils" - replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata" ) -// TestCrossCellDurability tests 2 things - -// 1. When PRS is run with the cross_cell durability policy setup, then the semi-sync settings on all the tablets are as expected -// 2. 
Bringing up a new vttablet should have its replication and semi-sync setup correctly without any external interference -func TestCrossCellDurability(t *testing.T) { - defer cluster.PanicHandler(t) - clusterInstance := utils.SetupReparentCluster(t, "cross_cell") - defer utils.TeardownCluster(clusterInstance) - tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets - - utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]}) - - // When tablets[0] is the primary, the only tablet in a different cell is tablets[3]. - // So the other two should have semi-sync turned off - utils.CheckSemiSyncSetupCorrectly(t, tablets[0], "ON") - utils.CheckSemiSyncSetupCorrectly(t, tablets[3], "ON") - utils.CheckSemiSyncSetupCorrectly(t, tablets[1], "OFF") - utils.CheckSemiSyncSetupCorrectly(t, tablets[2], "OFF") - - // Run forced reparent operation, this should proceed unimpeded. - out, err := utils.Prs(t, clusterInstance, tablets[3]) - require.NoError(t, err, out) - - utils.ConfirmReplication(t, tablets[3], []*cluster.Vttablet{tablets[0], tablets[1], tablets[2]}) - - // All the tablets will have semi-sync setup since tablets[3] is in Cell2 and all - // others are in Cell1, so all of them are eligible to send semi-sync ACKs - for _, tablet := range tablets { - utils.CheckSemiSyncSetupCorrectly(t, tablet, "ON") - } - - for i, supportsBackup := range []bool{false, true} { - // Bring up a new replica tablet - // In this new tablet, we do not disable active reparents, otherwise replication will not be started. 
- newReplica := utils.StartNewVTTablet(t, clusterInstance, 300+i, supportsBackup) - // Add the tablet to the list of tablets in this shard - clusterInstance.Keyspaces[0].Shards[0].Vttablets = append(clusterInstance.Keyspaces[0].Shards[0].Vttablets, newReplica) - // Check that we can replicate to it and semi-sync is setup correctly on it - utils.ConfirmReplication(t, tablets[3], []*cluster.Vttablet{tablets[0], tablets[1], tablets[2], newReplica}) - utils.CheckSemiSyncSetupCorrectly(t, newReplica, "ON") - } -} - -// TestFullStatus tests that the RPC FullStatus works as intended. -func TestFullStatus(t *testing.T) { +// TestRecoverWithMultipleVttabletFailures tests that ERS succeeds with the default values +// even when there are multiple vttablet failures. In this test we use the semi_sync policy +// to allow multiple failures to happen and still be recoverable. +// The test takes down the vttablets of the primary and a rdonly tablet and runs ERS with the +// default values of remote_operation_timeout, lock-timeout flags and wait_replicas_timeout subflag. 
+func TestRecoverWithMultipleVttabletFailures(t *testing.T) { defer cluster.PanicHandler(t) clusterInstance := utils.SetupReparentCluster(t, "semi_sync") defer utils.TeardownCluster(clusterInstance) tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]}) - // Check that full status gives the correct result for a primary tablet - primaryStatusString, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("GetFullStatus", tablets[0].Alias) - require.NoError(t, err) - primaryStatus := &replicationdatapb.FullStatus{} - err = protojson.Unmarshal([]byte(primaryStatusString), primaryStatus) - require.NoError(t, err) - assert.NotEmpty(t, primaryStatus.ServerUuid) - assert.NotEmpty(t, primaryStatus.ServerId) - // For a primary tablet there is no replication status - assert.Nil(t, primaryStatus.ReplicationStatus) - assert.Contains(t, primaryStatus.PrimaryStatus.String(), "vt-0000000101-bin") - assert.Equal(t, primaryStatus.GtidPurged, "MySQL56/") - assert.False(t, primaryStatus.ReadOnly) - assert.True(t, primaryStatus.SemiSyncPrimaryEnabled) - assert.True(t, primaryStatus.SemiSyncReplicaEnabled) - assert.True(t, primaryStatus.SemiSyncPrimaryStatus) - assert.False(t, primaryStatus.SemiSyncReplicaStatus) - assert.EqualValues(t, 3, primaryStatus.SemiSyncPrimaryClients) - assert.EqualValues(t, 1000000000000000000, primaryStatus.SemiSyncPrimaryTimeout) - assert.EqualValues(t, 1, primaryStatus.SemiSyncWaitForReplicaCount) - assert.Equal(t, "ROW", primaryStatus.BinlogFormat) - assert.Equal(t, "FULL", primaryStatus.BinlogRowImage) - assert.Equal(t, "ON", primaryStatus.GtidMode) - assert.True(t, primaryStatus.LogReplicaUpdates) - assert.True(t, primaryStatus.LogBinEnabled) - assert.Regexp(t, `[58]\.[07].*`, primaryStatus.Version) - assert.NotEmpty(t, primaryStatus.VersionComment) - - // Check that full status gives the correct result for a replica tablet - 
replicaStatusString, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("GetFullStatus", tablets[1].Alias) + // make tablets[1] a rdonly tablet. + err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", tablets[1].Alias, "rdonly") require.NoError(t, err) - replicaStatus := &replicationdatapb.FullStatus{} - err = protojson.Unmarshal([]byte(replicaStatusString), replicaStatus) - require.NoError(t, err) - assert.NotEmpty(t, replicaStatus.ServerUuid) - assert.NotEmpty(t, replicaStatus.ServerId) - assert.Contains(t, replicaStatus.ReplicationStatus.Position, "MySQL56/"+replicaStatus.ReplicationStatus.SourceUuid) - assert.EqualValues(t, mysql.ReplicationStateRunning, replicaStatus.ReplicationStatus.IoState) - assert.EqualValues(t, mysql.ReplicationStateRunning, replicaStatus.ReplicationStatus.SqlState) - assert.Equal(t, fileNameFromPosition(replicaStatus.ReplicationStatus.FilePosition), fileNameFromPosition(primaryStatus.PrimaryStatus.FilePosition)) - assert.LessOrEqual(t, rowNumberFromPosition(replicaStatus.ReplicationStatus.FilePosition), rowNumberFromPosition(primaryStatus.PrimaryStatus.FilePosition)) - assert.Equal(t, replicaStatus.ReplicationStatus.RelayLogSourceBinlogEquivalentPosition, primaryStatus.PrimaryStatus.FilePosition) - assert.Contains(t, replicaStatus.ReplicationStatus.RelayLogFilePosition, "vt-0000000102-relay") - assert.Equal(t, replicaStatus.ReplicationStatus.Position, primaryStatus.PrimaryStatus.Position) - assert.Equal(t, replicaStatus.ReplicationStatus.RelayLogPosition, primaryStatus.PrimaryStatus.Position) - assert.Empty(t, replicaStatus.ReplicationStatus.LastIoError) - assert.Empty(t, replicaStatus.ReplicationStatus.LastSqlError) - assert.Equal(t, replicaStatus.ReplicationStatus.SourceUuid, primaryStatus.ServerUuid) - assert.LessOrEqual(t, int(replicaStatus.ReplicationStatus.ReplicationLagSeconds), 1) - assert.False(t, replicaStatus.ReplicationStatus.ReplicationLagUnknown) - assert.EqualValues(t, 0, 
replicaStatus.ReplicationStatus.SqlDelay) - assert.False(t, replicaStatus.ReplicationStatus.SslAllowed) - assert.False(t, replicaStatus.ReplicationStatus.HasReplicationFilters) - assert.False(t, replicaStatus.ReplicationStatus.UsingGtid) - assert.True(t, replicaStatus.ReplicationStatus.AutoPosition) - assert.Equal(t, replicaStatus.ReplicationStatus.SourceHost, utils.Hostname) - assert.EqualValues(t, replicaStatus.ReplicationStatus.SourcePort, tablets[0].MySQLPort) - assert.Equal(t, replicaStatus.ReplicationStatus.SourceUser, "vt_repl") - assert.Contains(t, replicaStatus.PrimaryStatus.String(), "vt-0000000102-bin") - assert.Equal(t, replicaStatus.GtidPurged, "MySQL56/") - assert.True(t, replicaStatus.ReadOnly) - assert.False(t, replicaStatus.SemiSyncPrimaryEnabled) - assert.True(t, replicaStatus.SemiSyncReplicaEnabled) - assert.False(t, replicaStatus.SemiSyncPrimaryStatus) - assert.True(t, replicaStatus.SemiSyncReplicaStatus) - assert.EqualValues(t, 0, replicaStatus.SemiSyncPrimaryClients) - assert.EqualValues(t, 1000000000000000000, replicaStatus.SemiSyncPrimaryTimeout) - assert.EqualValues(t, 1, replicaStatus.SemiSyncWaitForReplicaCount) - assert.Equal(t, "ROW", replicaStatus.BinlogFormat) - assert.Equal(t, "FULL", replicaStatus.BinlogRowImage) - assert.Equal(t, "ON", replicaStatus.GtidMode) - assert.True(t, replicaStatus.LogReplicaUpdates) - assert.True(t, replicaStatus.LogBinEnabled) - assert.Regexp(t, `[58]\.[07].*`, replicaStatus.Version) - assert.NotEmpty(t, replicaStatus.VersionComment) -} -// fileNameFromPosition gets the file name from the position -func fileNameFromPosition(pos string) string { - return pos[0 : len(pos)-4] -} + // Confirm that replication is still working as intended + utils.ConfirmReplication(t, tablets[0], tablets[1:]) + + // Make the rdonly and primary tablets and databases unavailable. 
+ utils.StopTablet(t, tablets[1], true) + utils.StopTablet(t, tablets[0], true) + + // We expect this to succeed since we only have 1 primary eligible tablet which is down + out, err := utils.Ers(clusterInstance, nil, "", "") + require.NoError(t, err, out) -// rowNumberFromPosition gets the row number from the position -func rowNumberFromPosition(pos string) int { - rowNumStr := pos[len(pos)-4:] - rowNum, _ := strconv.Atoi(rowNumStr) - return rowNum + newPrimary := utils.GetNewPrimary(t, clusterInstance) + utils.ConfirmReplication(t, newPrimary, []*cluster.Vttablet{tablets[2], tablets[3]}) } // TestTabletRestart tests that a running tablet can be restarted and everything is still fine diff --git a/go/test/endtoend/reparent/plannedreparent/reparent_test.go b/go/test/endtoend/reparent/plannedreparent/reparent_test.go index d33992d5a81..c2f2b948d71 100644 --- a/go/test/endtoend/reparent/plannedreparent/reparent_test.go +++ b/go/test/endtoend/reparent/plannedreparent/reparent_test.go @@ -19,17 +19,20 @@ package plannedreparent import ( "context" "fmt" + "strconv" "testing" "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "google.golang.org/protobuf/encoding/protojson" - utilstest "vitess.io/vitess/go/test/endtoend/utils" - + "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/test/endtoend/reparent/utils" + utilstest "vitess.io/vitess/go/test/endtoend/utils" "vitess.io/vitess/go/vt/log" + replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata" ) func TestPrimaryToSpareStateChangeImpossible(t *testing.T) { @@ -384,3 +387,142 @@ func TestReparentDoesntHangIfPrimaryFails(t *testing.T) { require.Error(t, err) assert.Contains(t, out, "primary failed to PopulateReparentJournal") } + +// TestCrossCellDurability tests 2 things - +// 1. When PRS is run with the cross_cell durability policy setup, then the semi-sync settings on all the tablets are as expected +// 2. 
Bringing up a new vttablet should have its replication and semi-sync setup correctly without any manual intervention +func TestCrossCellDurability(t *testing.T) { + defer cluster.PanicHandler(t) + clusterInstance := utils.SetupReparentCluster(t, "cross_cell") + defer utils.TeardownCluster(clusterInstance) + tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets + + utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]}) + + // When tablets[0] is the primary, the only tablet in a different cell is tablets[3]. + // So the other two should have semi-sync turned off + utils.CheckSemiSyncSetupCorrectly(t, tablets[0], "ON") + utils.CheckSemiSyncSetupCorrectly(t, tablets[3], "ON") + utils.CheckSemiSyncSetupCorrectly(t, tablets[1], "OFF") + utils.CheckSemiSyncSetupCorrectly(t, tablets[2], "OFF") + + // Run forced reparent operation, this should proceed unimpeded. + out, err := utils.Prs(t, clusterInstance, tablets[3]) + require.NoError(t, err, out) + + utils.ConfirmReplication(t, tablets[3], []*cluster.Vttablet{tablets[0], tablets[1], tablets[2]}) + + // All the tablets will have semi-sync setup since tablets[3] is in Cell2 and all + // others are in Cell1, so all of them are eligible to send semi-sync ACKs + for _, tablet := range tablets { + utils.CheckSemiSyncSetupCorrectly(t, tablet, "ON") + } + + for i, supportsBackup := range []bool{false, true} { + // Bring up a new replica tablet + // In this new tablet, we do not disable active reparents, otherwise replication will not be started. 
+ newReplica := utils.StartNewVTTablet(t, clusterInstance, 300+i, supportsBackup) + // Add the tablet to the list of tablets in this shard + clusterInstance.Keyspaces[0].Shards[0].Vttablets = append(clusterInstance.Keyspaces[0].Shards[0].Vttablets, newReplica) + // Check that we can replicate to it and semi-sync is setup correctly on it + utils.ConfirmReplication(t, tablets[3], []*cluster.Vttablet{tablets[0], tablets[1], tablets[2], newReplica}) + utils.CheckSemiSyncSetupCorrectly(t, newReplica, "ON") + } +} + +// TestFullStatus tests that the RPC FullStatus works as intended. +func TestFullStatus(t *testing.T) { + defer cluster.PanicHandler(t) + clusterInstance := utils.SetupReparentCluster(t, "semi_sync") + defer utils.TeardownCluster(clusterInstance) + tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets + utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]}) + + // Check that full status gives the correct result for a primary tablet + primaryStatusString, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("GetFullStatus", tablets[0].Alias) + require.NoError(t, err) + primaryStatus := &replicationdatapb.FullStatus{} + err = protojson.Unmarshal([]byte(primaryStatusString), primaryStatus) + require.NoError(t, err) + assert.NotEmpty(t, primaryStatus.ServerUuid) + assert.NotEmpty(t, primaryStatus.ServerId) + // For a primary tablet there is no replication status + assert.Nil(t, primaryStatus.ReplicationStatus) + assert.Contains(t, primaryStatus.PrimaryStatus.String(), "vt-0000000101-bin") + assert.Equal(t, primaryStatus.GtidPurged, "MySQL56/") + assert.False(t, primaryStatus.ReadOnly) + assert.True(t, primaryStatus.SemiSyncPrimaryEnabled) + assert.True(t, primaryStatus.SemiSyncReplicaEnabled) + assert.True(t, primaryStatus.SemiSyncPrimaryStatus) + assert.False(t, primaryStatus.SemiSyncReplicaStatus) + assert.EqualValues(t, 3, primaryStatus.SemiSyncPrimaryClients) + assert.EqualValues(t, 
1000000000000000000, primaryStatus.SemiSyncPrimaryTimeout) + assert.EqualValues(t, 1, primaryStatus.SemiSyncWaitForReplicaCount) + assert.Equal(t, "ROW", primaryStatus.BinlogFormat) + assert.Equal(t, "FULL", primaryStatus.BinlogRowImage) + assert.Equal(t, "ON", primaryStatus.GtidMode) + assert.True(t, primaryStatus.LogReplicaUpdates) + assert.True(t, primaryStatus.LogBinEnabled) + assert.Regexp(t, `[58]\.[07].*`, primaryStatus.Version) + assert.NotEmpty(t, primaryStatus.VersionComment) + + // Check that full status gives the correct result for a replica tablet + replicaStatusString, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("GetFullStatus", tablets[1].Alias) + require.NoError(t, err) + replicaStatus := &replicationdatapb.FullStatus{} + err = protojson.Unmarshal([]byte(replicaStatusString), replicaStatus) + require.NoError(t, err) + assert.NotEmpty(t, replicaStatus.ServerUuid) + assert.NotEmpty(t, replicaStatus.ServerId) + assert.Contains(t, replicaStatus.ReplicationStatus.Position, "MySQL56/"+replicaStatus.ReplicationStatus.SourceUuid) + assert.EqualValues(t, mysql.ReplicationStateRunning, replicaStatus.ReplicationStatus.IoState) + assert.EqualValues(t, mysql.ReplicationStateRunning, replicaStatus.ReplicationStatus.SqlState) + assert.Equal(t, fileNameFromPosition(replicaStatus.ReplicationStatus.FilePosition), fileNameFromPosition(primaryStatus.PrimaryStatus.FilePosition)) + assert.LessOrEqual(t, rowNumberFromPosition(replicaStatus.ReplicationStatus.FilePosition), rowNumberFromPosition(primaryStatus.PrimaryStatus.FilePosition)) + assert.Equal(t, replicaStatus.ReplicationStatus.RelayLogSourceBinlogEquivalentPosition, primaryStatus.PrimaryStatus.FilePosition) + assert.Contains(t, replicaStatus.ReplicationStatus.RelayLogFilePosition, "vt-0000000102-relay") + assert.Equal(t, replicaStatus.ReplicationStatus.Position, primaryStatus.PrimaryStatus.Position) + assert.Equal(t, replicaStatus.ReplicationStatus.RelayLogPosition, 
primaryStatus.PrimaryStatus.Position) + assert.Empty(t, replicaStatus.ReplicationStatus.LastIoError) + assert.Empty(t, replicaStatus.ReplicationStatus.LastSqlError) + assert.Equal(t, replicaStatus.ReplicationStatus.SourceUuid, primaryStatus.ServerUuid) + assert.LessOrEqual(t, int(replicaStatus.ReplicationStatus.ReplicationLagSeconds), 1) + assert.False(t, replicaStatus.ReplicationStatus.ReplicationLagUnknown) + assert.EqualValues(t, 0, replicaStatus.ReplicationStatus.SqlDelay) + assert.False(t, replicaStatus.ReplicationStatus.SslAllowed) + assert.False(t, replicaStatus.ReplicationStatus.HasReplicationFilters) + assert.False(t, replicaStatus.ReplicationStatus.UsingGtid) + assert.True(t, replicaStatus.ReplicationStatus.AutoPosition) + assert.Equal(t, replicaStatus.ReplicationStatus.SourceHost, utils.Hostname) + assert.EqualValues(t, replicaStatus.ReplicationStatus.SourcePort, tablets[0].MySQLPort) + assert.Equal(t, replicaStatus.ReplicationStatus.SourceUser, "vt_repl") + assert.Contains(t, replicaStatus.PrimaryStatus.String(), "vt-0000000102-bin") + assert.Equal(t, replicaStatus.GtidPurged, "MySQL56/") + assert.True(t, replicaStatus.ReadOnly) + assert.False(t, replicaStatus.SemiSyncPrimaryEnabled) + assert.True(t, replicaStatus.SemiSyncReplicaEnabled) + assert.False(t, replicaStatus.SemiSyncPrimaryStatus) + assert.True(t, replicaStatus.SemiSyncReplicaStatus) + assert.EqualValues(t, 0, replicaStatus.SemiSyncPrimaryClients) + assert.EqualValues(t, 1000000000000000000, replicaStatus.SemiSyncPrimaryTimeout) + assert.EqualValues(t, 1, replicaStatus.SemiSyncWaitForReplicaCount) + assert.Equal(t, "ROW", replicaStatus.BinlogFormat) + assert.Equal(t, "FULL", replicaStatus.BinlogRowImage) + assert.Equal(t, "ON", replicaStatus.GtidMode) + assert.True(t, replicaStatus.LogReplicaUpdates) + assert.True(t, replicaStatus.LogBinEnabled) + assert.Regexp(t, `[58]\.[07].*`, replicaStatus.Version) + assert.NotEmpty(t, replicaStatus.VersionComment) +} + +// fileNameFromPosition gets the 
file name from the position +func fileNameFromPosition(pos string) string { + return pos[0 : len(pos)-4] +} + +// rowNumberFromPosition gets the row number from the position +func rowNumberFromPosition(pos string) int { + rowNumStr := pos[len(pos)-4:] + rowNum, _ := strconv.Atoi(rowNumStr) + return rowNum +} diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index 43d1b31e0f2..0ac4129fd8b 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -68,14 +68,18 @@ func TestDownPrimary(t *testing.T) { // check that the replication is setup correctly before we failover utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second) - // Make the rdonly tablet unavailable - err := rdonly.MysqlctlProcess.Stop() + // Make the rdonly vttablet unavailable + err := rdonly.VttabletProcess.TearDown() + require.NoError(t, err) + err = rdonly.MysqlctlProcess.Stop() + require.NoError(t, err) + // Make the current primary vttablet unavailable. + err = curPrimary.VttabletProcess.TearDown() require.NoError(t, err) - // Make the current primary database unavailable. 
err = curPrimary.MysqlctlProcess.Stop() require.NoError(t, err) defer func() { - // we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests + // we remove the tablet from our global list utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) utils.PermanentlyRemoveVttablet(clusterInfo, rdonly) }() diff --git a/go/vt/topo/locks.go b/go/vt/topo/locks.go index 5ee60c2ea2f..a9e71a6e7d6 100644 --- a/go/vt/topo/locks.go +++ b/go/vt/topo/locks.go @@ -27,6 +27,7 @@ import ( "github.com/spf13/pflag" + _flag "vitess.io/vitess/go/internal/flag" "vitess.io/vitess/go/trace" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/proto/vtrpc" @@ -38,15 +39,14 @@ import ( // keyspaces and shards. var ( - // DefaultLockTimeout is a good value to use as a default for - // locking a shard / keyspace. - // Now used only for unlock operations - defaultLockTimeout = 30 * time.Second + // LockTimeout is the maximum duration for which a + // shard / keyspace lock can be acquired. + LockTimeout = 45 * time.Second // RemoteOperationTimeout is used for operations where we have to // call out to another process. // Used for RPC calls (including topo server calls) - RemoteOperationTimeout = 30 * time.Second + RemoteOperationTimeout = 15 * time.Second ) // Lock describes a long-running lock on a keyspace or a shard. @@ -70,6 +70,7 @@ func init() { func registerTopoLockFlags(fs *pflag.FlagSet) { fs.DurationVar(&RemoteOperationTimeout, "remote_operation_timeout", RemoteOperationTimeout, "time to wait for a remote operation") + fs.DurationVar(&LockTimeout, "lock-timeout", LockTimeout, "Maximum time for which a shard/keyspace lock can be acquired for") } // newLock creates a new Lock.
@@ -244,7 +245,7 @@ func CheckKeyspaceLockedAndRenew(ctx context.Context, keyspace string) error { func (l *Lock) lockKeyspace(ctx context.Context, ts *Server, keyspace string) (LockDescriptor, error) { log.Infof("Locking keyspace %v for action %v", keyspace, l.Action) - ctx, cancel := context.WithTimeout(ctx, RemoteOperationTimeout) + ctx, cancel := context.WithTimeout(ctx, getLockTimeout()) defer cancel() span, ctx := trace.NewSpan(ctx, "TopoServer.LockKeyspaceForAction") @@ -265,10 +266,8 @@ func (l *Lock) unlockKeyspace(ctx context.Context, ts *Server, keyspace string, // Detach from the parent timeout, but copy the trace span. // We need to still release the lock even if the parent // context timed out. - // Note that we are not using the user provided RemoteOperationTimeout - // here because it is possible that that timeout is too short. ctx = trace.CopySpan(context.TODO(), ctx) - ctx, cancel := context.WithTimeout(ctx, defaultLockTimeout) + ctx, cancel := context.WithTimeout(ctx, RemoteOperationTimeout) defer cancel() span, ctx := trace.NewSpan(ctx, "TopoServer.UnlockKeyspaceForAction") @@ -385,7 +384,7 @@ func CheckShardLocked(ctx context.Context, keyspace, shard string) error { func (l *Lock) lockShard(ctx context.Context, ts *Server, keyspace, shard string) (LockDescriptor, error) { log.Infof("Locking shard %v/%v for action %v", keyspace, shard, l.Action) - ctx, cancel := context.WithTimeout(ctx, RemoteOperationTimeout) + ctx, cancel := context.WithTimeout(ctx, getLockTimeout()) defer cancel() span, ctx := trace.NewSpan(ctx, "TopoServer.LockShardForAction") @@ -406,10 +405,8 @@ func (l *Lock) lockShard(ctx context.Context, ts *Server, keyspace, shard string func (l *Lock) unlockShard(ctx context.Context, ts *Server, keyspace, shard string, lockDescriptor LockDescriptor, actionError error) error { // Detach from the parent timeout, but copy the trace span. // We need to still release the lock even if the parent context timed out. 
- // Note that we are not using the user provided RemoteOperationTimeout - // here because it is possible that that timeout is too short. ctx = trace.CopySpan(context.TODO(), ctx) - ctx, cancel := context.WithTimeout(ctx, defaultLockTimeout) + ctx, cancel := context.WithTimeout(ctx, RemoteOperationTimeout) defer cancel() span, ctx := trace.NewSpan(ctx, "TopoServer.UnlockShardForAction") @@ -428,3 +425,15 @@ func (l *Lock) unlockShard(ctx context.Context, ts *Server, keyspace, shard stri } return lockDescriptor.Unlock(ctx) } + +// getLockTimeout is shim code for backward compatibility with v15: an explicitly provided lock-timeout flag wins, otherwise an explicitly provided remote_operation_timeout flag is used, otherwise LockTimeout's default applies. +// This code can be removed in v17+ and LockTimeout can be used directly. +func getLockTimeout() time.Duration { + if _flag.IsFlagProvided("lock-timeout") { + return LockTimeout + } + if _flag.IsFlagProvided("remote_operation_timeout") { + return RemoteOperationTimeout + } + return LockTimeout +} diff --git a/go/vt/topo/locks_test.go b/go/vt/topo/locks_test.go new file mode 100644 index 00000000000..da4f179f83c --- /dev/null +++ b/go/vt/topo/locks_test.go @@ -0,0 +1,85 @@ +/* +Copyright 2022 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ + +package topo + +import ( + "os" + "testing" + "time" + + "github.com/spf13/pflag" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/internal/flag" +) + +// TestGetLockTimeout tests the behaviour of +// getLockTimeout function in different situations where +// the two flags `remote_operation_timeout` and `lock-timeout` are +// provided or not. +func TestGetLockTimeout(t *testing.T) { + tests := []struct { + description string + lockTimeoutValue string + remoteOperationTimeoutValue string + expectedLockTimeout time.Duration + }{ + { + description: "no flags specified", + lockTimeoutValue: "", + remoteOperationTimeoutValue: "", + expectedLockTimeout: 45 * time.Second, + }, { + description: "lock-timeout flag specified", + lockTimeoutValue: "33s", + remoteOperationTimeoutValue: "", + expectedLockTimeout: 33 * time.Second, + }, { + description: "remote operation timeout flag specified", + lockTimeoutValue: "", + remoteOperationTimeoutValue: "33s", + expectedLockTimeout: 33 * time.Second, + }, { + description: "both flags specified", + lockTimeoutValue: "33s", + remoteOperationTimeoutValue: "22s", + expectedLockTimeout: 33 * time.Second, + }, + } + + for _, tt := range tests { + t.Run(tt.description, func(t *testing.T) { + var args []string + if tt.lockTimeoutValue != "" { + args = append(args, "--lock-timeout", tt.lockTimeoutValue) + } + if tt.remoteOperationTimeoutValue != "" { + args = append(args, "--remote_operation_timeout", tt.remoteOperationTimeoutValue) + } + os.Args = os.Args[0:1] + os.Args = append(os.Args, args...) 
+ + fs := pflag.NewFlagSet("test", pflag.ExitOnError) + registerTopoLockFlags(fs) + flag.Parse(fs) + + val := getLockTimeout() + require.Equal(t, tt.expectedLockTimeout, val) + }) + } + +} diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter.go b/go/vt/vtctl/reparentutil/emergency_reparenter.go index ba846ebc147..9e4ac550a8f 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter.go @@ -192,7 +192,7 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve } // Stop replication on all the tablets and build their status map - stoppedReplicationSnapshot, err = stopReplicationAndBuildStatusMaps(ctx, erp.tmc, ev, tabletMap, opts.WaitReplicasTimeout, opts.IgnoreReplicas, opts.NewPrimaryAlias, opts.durability, erp.logger) + stoppedReplicationSnapshot, err = stopReplicationAndBuildStatusMaps(ctx, erp.tmc, ev, tabletMap, topo.RemoteOperationTimeout, opts.IgnoreReplicas, opts.NewPrimaryAlias, opts.durability, erp.logger) if err != nil { return vterrors.Wrapf(err, "failed to stop replication and build status maps: %v", err) } diff --git a/go/vt/vtctl/reparentutil/replication.go b/go/vt/vtctl/reparentutil/replication.go index b1510ffaf09..512b3a60221 100644 --- a/go/vt/vtctl/reparentutil/replication.go +++ b/go/vt/vtctl/reparentutil/replication.go @@ -214,7 +214,7 @@ func stopReplicationAndBuildStatusMaps( tmc tmclient.TabletManagerClient, ev *events.Reparent, tabletMap map[string]*topo.TabletInfo, - waitReplicasTimeout time.Duration, + stopReplicationTimeout time.Duration, ignoredTablets sets.String, tabletToWaitFor *topodatapb.TabletAlias, durability Durabler, @@ -233,7 +233,7 @@ func stopReplicationAndBuildStatusMaps( } ) - groupCtx, groupCancel := context.WithTimeout(ctx, waitReplicasTimeout) + groupCtx, groupCancel := context.WithTimeout(ctx, stopReplicationTimeout) defer groupCancel() fillStatus := func(alias string, tabletInfo *topo.TabletInfo, mustWaitForTablet bool) { diff --git 
a/go/vt/vtctl/reparentutil/replication_test.go b/go/vt/vtctl/reparentutil/replication_test.go index 42b01cac770..01f043ac827 100644 --- a/go/vt/vtctl/reparentutil/replication_test.go +++ b/go/vt/vtctl/reparentutil/replication_test.go @@ -18,27 +18,33 @@ package reparentutil import ( "context" + "os" "testing" "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "vitess.io/vitess/go/vt/vterrors" - - "github.com/stretchr/testify/assert" "k8s.io/apimachinery/pkg/util/sets" + _flag "vitess.io/vitess/go/internal/flag" "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/topotools/events" + "vitess.io/vitess/go/vt/vterrors" "vitess.io/vitess/go/vt/vttablet/tmclient" replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata" topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) +func TestMain(m *testing.M) { + _flag.ParseFlagsForTest() + os.Exit(m.Run()) +} + func TestFindValidEmergencyReparentCandidates(t *testing.T) { t.Parallel() @@ -278,7 +284,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) { durability string tmc *stopReplicationAndBuildStatusMapsTestTMClient tabletMap map[string]*topo.TabletInfo - waitReplicasTimeout time.Duration + stopReplicasTimeout time.Duration ignoredTablets sets.String tabletToWaitFor *topodatapb.TabletAlias expectedStatusMap map[string]*replicationdatapb.StopReplicationStatus @@ -796,7 +802,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) { shouldErr: true, // we get multiple errors, so we fail }, { - name: "waitReplicasTimeout exceeded", + name: "stopReplicasTimeout exceeded", durability: "none", tmc: &stopReplicationAndBuildStatusMapsTestTMClient{ stopReplicationAndGetStatusDelays: map[string]time.Duration{ @@ -840,7 +846,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) { }, }, }, - waitReplicasTimeout: time.Millisecond * 5, + stopReplicasTimeout: 
time.Millisecond * 5, ignoredTablets: sets.NewString(), expectedStatusMap: map[string]*replicationdatapb.StopReplicationStatus{ "zone1-0000000101": { @@ -1098,7 +1104,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) { Uid: 102, }, }}, - waitReplicasTimeout: time.Minute, + stopReplicasTimeout: time.Minute, expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{}, shouldErr: false, }, @@ -1110,7 +1116,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) { t.Run(tt.name, func(t *testing.T) { durability, err := GetDurabilityPolicy(tt.durability) require.NoError(t, err) - res, err := stopReplicationAndBuildStatusMaps(ctx, tt.tmc, &events.Reparent{}, tt.tabletMap, tt.waitReplicasTimeout, tt.ignoredTablets, tt.tabletToWaitFor, durability, logger) + res, err := stopReplicationAndBuildStatusMaps(ctx, tt.tmc, &events.Reparent{}, tt.tabletMap, tt.stopReplicasTimeout, tt.ignoredTablets, tt.tabletToWaitFor, durability, logger) if tt.shouldErr { assert.Error(t, err) return diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 0c5cadd2431..fd54e9ed582 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -64,7 +64,6 @@ var ( auditPurgeDuration = 7 * 24 * time.Hour // Equivalent of 7 days recoveryPeriodBlockDuration = 30 * time.Second preventCrossCellFailover = false - lockShardTimeout = 30 * time.Second waitReplicasTimeout = 30 * time.Second topoInformationRefreshDuration = 15 * time.Second recoveryPollDuration = 1 * time.Second @@ -82,7 +81,8 @@ func RegisterFlags(fs *pflag.FlagSet) { fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. 
Should be in multiples of days") fs.DurationVar(&recoveryPeriodBlockDuration, "recovery-period-block-duration", recoveryPeriodBlockDuration, "Duration for which a new recovery is blocked on an instance after running a recovery") fs.BoolVar(&preventCrossCellFailover, "prevent-cross-cell-failover", preventCrossCellFailover, "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover") - fs.DurationVar(&lockShardTimeout, "lock-shard-timeout", lockShardTimeout, "Duration for which a shard lock is held when running a recovery") + fs.Duration("lock-shard-timeout", 30*time.Second, "Duration for which a shard lock is held when running a recovery") + _ = fs.MarkDeprecated("lock-shard-timeout", "Please use lock-timeout instead.") fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs") fs.DurationVar(&topoInformationRefreshDuration, "topo-information-refresh-duration", topoInformationRefreshDuration, "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server") fs.DurationVar(&recoveryPollDuration, "recovery-poll-duration", recoveryPollDuration, "Timer duration on which VTOrc polls its database to run a recovery") @@ -103,8 +103,7 @@ type Configuration struct { AuditPurgeDays uint // Days after which audit entries are purged from the database RecoveryPeriodBlockSeconds int // (overrides `RecoveryPeriodBlockMinutes`) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on smae instance as well as flapping PreventCrossDataCenterPrimaryFailover bool // When true (default: false), cross-DC primary failover are not allowed, vtorc will do all it can to only fail over within same DC, or else not fail over at all. - LockShardTimeoutSeconds int // Timeout on context used to lock shard. 
Should be a small value because we should fail-fast - WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockShardTimeoutSeconds since that is the total time we use for an ERS. + WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server. RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs } @@ -133,7 +132,6 @@ func UpdateConfigValuesFromFlags() { Config.AuditPurgeDays = uint(auditPurgeDuration / (time.Hour * 24)) Config.RecoveryPeriodBlockSeconds = int(recoveryPeriodBlockDuration / time.Second) Config.PreventCrossDataCenterPrimaryFailover = preventCrossCellFailover - Config.LockShardTimeoutSeconds = int(lockShardTimeout / time.Second) Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second) Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second) Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second) @@ -157,7 +155,6 @@ func newConfiguration() *Configuration { AuditPurgeDays: 7, RecoveryPeriodBlockSeconds: 30, PreventCrossDataCenterPrimaryFailover: false, - LockShardTimeoutSeconds: 30, WaitReplicasTimeoutSeconds: 30, TopoInformationRefreshSeconds: 15, RecoveryPollSeconds: 1, diff --git a/go/vt/vtorc/config/config_test.go b/go/vt/vtorc/config/config_test.go index 90e78d56623..2009b476f1d 100644 --- a/go/vt/vtorc/config/config_test.go +++ b/go/vt/vtorc/config/config_test.go @@ -187,21 +187,6 @@ func TestUpdateConfigValuesFromFlags(t *testing.T) { require.Equal(t, testConfig, Config) }) - t.Run("override lockShardTimeout", func(t *testing.T) { 
- oldLockShardTimeout := lockShardTimeout - lockShardTimeout = 3 * time.Hour - // Restore the changes we make - defer func() { - Config = newConfiguration() - lockShardTimeout = oldLockShardTimeout - }() - - testConfig := newConfiguration() - testConfig.LockShardTimeoutSeconds = 10800 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - t.Run("override waitReplicasTimeout", func(t *testing.T) { oldWaitReplicasTimeout := waitReplicasTimeout waitReplicasTimeout = 3*time.Minute + 4*time.Second diff --git a/go/vt/vtorc/logic/tablet_discovery.go b/go/vt/vtorc/logic/tablet_discovery.go index 9ace94af430..e6cf71f101c 100644 --- a/go/vt/vtorc/logic/tablet_discovery.go +++ b/go/vt/vtorc/logic/tablet_discovery.go @@ -202,20 +202,21 @@ func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []an // Discover new tablets. // TODO(sougou): enhance this to work with multi-schema, // where each instanceKey can have multiple tablets. - latestInstances := make(map[inst.InstanceKey]bool) + latestInstances := make(map[string]bool) + var wg sync.WaitGroup for _, tabletInfo := range tablets { tablet := tabletInfo.Tablet - if tablet.MysqlHostname == "" { + if tablet.Type != topodatapb.TabletType_PRIMARY && !topo.IsReplicaType(tablet.Type) { continue } - if tablet.Type != topodatapb.TabletType_PRIMARY && !topo.IsReplicaType(tablet.Type) { + latestInstances[topoproto.TabletAliasString(tablet.Alias)] = true + if tablet.MysqlHostname == "" { continue } instanceKey := inst.InstanceKey{ Hostname: tablet.MysqlHostname, Port: int(tablet.MysqlPort), } - latestInstances[instanceKey] = true old, err := inst.ReadTablet(instanceKey) if err != nil && err != inst.ErrTabletAliasNil { log.Error(err) @@ -228,9 +229,14 @@ func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []an log.Error(err) continue } - loader(&instanceKey) + wg.Add(1) + go func() { + defer wg.Done() + loader(&instanceKey) + }() log.Infof("Discovered: %v", tablet) } + 
wg.Wait() // Forget tablets that were removed. toForget := make(map[inst.InstanceKey]*topodatapb.Tablet) @@ -239,12 +245,12 @@ func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []an Hostname: row.GetString("hostname"), Port: row.GetInt("port"), } - if !latestInstances[curKey] { - tablet := &topodatapb.Tablet{} - if err := prototext.Unmarshal([]byte(row.GetString("info")), tablet); err != nil { - log.Error(err) - return nil - } + tablet := &topodatapb.Tablet{} + if err := prototext.Unmarshal([]byte(row.GetString("info")), tablet); err != nil { + log.Error(err) + return nil + } + if !latestInstances[topoproto.TabletAliasString(tablet.Alias)] { toForget[curKey] = tablet } return nil @@ -285,18 +291,16 @@ func LockShard(ctx context.Context, instanceKey inst.InstanceKey) (context.Conte if err != nil { return nil, nil, err } - ctx, cancel := context.WithTimeout(ctx, time.Duration(config.Config.LockShardTimeoutSeconds)*time.Second) + atomic.AddInt32(&shardsLockCounter, 1) ctx, unlock, err := ts.LockShard(ctx, tablet.Keyspace, tablet.Shard, "Orc Recovery") if err != nil { - cancel() atomic.AddInt32(&shardsLockCounter, -1) return nil, nil, err } return ctx, func(e *error) { defer atomic.AddInt32(&shardsLockCounter, -1) unlock(e) - cancel() }, nil } diff --git a/go/vt/vtorc/logic/tablet_discovery_test.go b/go/vt/vtorc/logic/tablet_discovery_test.go index 64262eff250..410f1a70e0a 100644 --- a/go/vt/vtorc/logic/tablet_discovery_test.go +++ b/go/vt/vtorc/logic/tablet_discovery_test.go @@ -18,6 +18,7 @@ package logic import ( "context" + "sync/atomic" "testing" "github.com/google/go-cmp/cmp" @@ -137,10 +138,33 @@ func TestRefreshTabletsInKeyspaceShard(t *testing.T) { verifyRefreshTabletsInKeyspaceShard(t, true, 3, tablets) }) + t.Run("tablet shutdown removes mysql hostname and port. 
We shouldn't forget the tablet", func(t *testing.T) { + defer func() { + _, err = ts.UpdateTabletFields(context.Background(), tab100.Alias, func(tablet *topodatapb.Tablet) error { + tablet.MysqlHostname = hostname + tablet.MysqlPort = 100 + return nil + }) + }() + // Let's assume tab100 shutdown. This would clear its tablet hostname and port + _, err = ts.UpdateTabletFields(context.Background(), tab100.Alias, func(tablet *topodatapb.Tablet) error { + tablet.MysqlHostname = "" + tablet.MysqlPort = 0 + return nil + }) + require.NoError(t, err) + // We expect no tablets to be refreshed. Also, tab100 shouldn't be forgotten + verifyRefreshTabletsInKeyspaceShard(t, false, 0, tablets) + }) + t.Run("change a tablet and call refreshTabletsInKeyspaceShard again", func(t *testing.T) { startTimeInitially := tab100.PrimaryTermStartTime.Seconds defer func() { tab100.PrimaryTermStartTime.Seconds = startTimeInitially + _, err = ts.UpdateTabletFields(context.Background(), tab100.Alias, func(tablet *topodatapb.Tablet) error { + tablet.PrimaryTermStartTime.Seconds = startTimeInitially + return nil + }) }() tab100.PrimaryTermStartTime.Seconds = 1000 _, err = ts.UpdateTabletFields(context.Background(), tab100.Alias, func(tablet *topodatapb.Tablet) error { @@ -224,17 +248,18 @@ func TestShardPrimary(t *testing.T) { // verifyRefreshTabletsInKeyspaceShard calls refreshTabletsInKeyspaceShard with the forceRefresh parameter provided and verifies that // the number of instances refreshed matches the parameter and all the tablets match the ones provided func verifyRefreshTabletsInKeyspaceShard(t *testing.T, forceRefresh bool, instanceRefreshRequired int, tablets []*topodatapb.Tablet) { - instancesRefreshed := 0 + var instancesRefreshed atomic.Int32 + instancesRefreshed.Store(0) // call refreshTabletsInKeyspaceShard while counting all the instances that are refreshed refreshTabletsInKeyspaceShard(context.Background(), keyspace, shard, func(instanceKey *inst.InstanceKey) { - 
instancesRefreshed++ + instancesRefreshed.Add(1) }, forceRefresh) // Verify that all the tablets are present in the database for _, tablet := range tablets { verifyTabletInfo(t, tablet, "") } // Verify that refresh as many tablets as expected - assert.EqualValues(t, instanceRefreshRequired, instancesRefreshed) + assert.EqualValues(t, instanceRefreshRequired, instancesRefreshed.Load()) } // verifyTabletInfo verifies that the tablet information read from the vtorc database diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 001d702cfde..db77cac42ca 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -292,6 +292,8 @@ func recoverDeadPrimary(ctx context.Context, analysisEntry inst.ReplicationAnaly log.Warningf("ERS - %s", value) case logutilpb.Level_ERROR: log.Errorf("ERS - %s", value) + default: + log.Infof("ERS - %s", value) } _ = AuditTopologyRecovery(topologyRecovery, value) })).ReparentShard(ctx, @@ -304,6 +306,9 @@ func recoverDeadPrimary(ctx context.Context, analysisEntry inst.ReplicationAnaly PreventCrossCellPromotion: config.Config.PreventCrossDataCenterPrimaryFailover, }, ) + if err != nil { + log.Errorf("Error running ERS - %v", err) + } if ev != nil && ev.NewPrimary != nil { promotedReplica, _, _ = inst.ReadInstance(&inst.InstanceKey{ From 75c3a799f1b2ecaf2ae40f52d08e15b71126ded2 Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Sat, 10 Dec 2022 17:12:52 +0530 Subject: [PATCH 3/9] log: also log error in DiscoverInstance when force discovery is specified (#11936) Signed-off-by: Manan Gupta Signed-off-by: Manan Gupta --- go/vt/vtorc/logic/orchestrator.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/go/vt/vtorc/logic/orchestrator.go b/go/vt/vtorc/logic/orchestrator.go index ffc5b789aeb..77e0fd30993 100644 --- a/go/vt/vtorc/logic/orchestrator.go +++ b/go/vt/vtorc/logic/orchestrator.go @@ 
-234,6 +234,10 @@ func DiscoverInstance(instanceKey inst.InstanceKey, forceDiscovery bool) { backendLatency := latency.Elapsed("backend") instanceLatency := latency.Elapsed("instance") + if forceDiscovery { + log.Infof("Force discovered - %+v, err - %v", instance, err) + } + if instance == nil { failedDiscoveriesCounter.Inc(1) _ = discoveryMetrics.Append(&discovery.Metric{ @@ -255,10 +259,6 @@ func DiscoverInstance(instanceKey inst.InstanceKey, forceDiscovery bool) { return } - if forceDiscovery { - log.Infof("Force discovered - %+v", instance) - } - _ = discoveryMetrics.Append(&discovery.Metric{ Timestamp: time.Now(), InstanceKey: instanceKey, From 3dde42a6aa33fbbe1e8571ae4024308b72de4e77 Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Thu, 5 Jan 2023 14:44:29 +0530 Subject: [PATCH 4/9] VTOrc Code Cleanup - generate_base, replace cluster_name with keyspace and shard. (#12012) * feat: refactor generate commands of VTOrc to be in a single file Signed-off-by: Manan Gupta * refactor: cleanup create table formatting Signed-off-by: Manan Gupta * feat: cleanup the usage of IsSQLite and IsMySQL Signed-off-by: Manan Gupta * feat: remove unused minimal instance Signed-off-by: Manan Gupta * feat: remove unused table cluster_domain_name Signed-off-by: Manan Gupta * feat: fix vtorc database to store keyspace and shard instead of cluster Signed-off-by: Manan Gupta * feat: remove unused attributes Signed-off-by: Manan Gupta * feat: remove unused cluster domain Signed-off-by: Manan Gupta * feat: change GetClusterName to GetKeyspaceAndShardName Signed-off-by: Manan Gupta * feat: fix insertion into database_instance Signed-off-by: Manan Gupta * feat: fix SnapshotTopologies Signed-off-by: Manan Gupta * feat: remove inject unseen primary and inject seed Signed-off-by: Manan Gupta * feat: remove ClusterName from Instance Signed-off-by: Manan Gupta * feat: fix Audit operations Signed-off-by: Manan Gupta * feat: add Keyspace and Shard 
to cluster information to replace ClusterName Signed-off-by: Manan Gupta * feat: fix attempt failure detection registeration Signed-off-by: Manan Gupta * feat: fix blocked topology recoveries Signed-off-by: Manan Gupta * feat: fix topology recovery Signed-off-by: Manan Gupta * feat: reading recovery instances Signed-off-by: Manan Gupta * feat: fix get replication and analysis Signed-off-by: Manan Gupta * feat: fix bug in query Signed-off-by: Manan Gupta * test: add tests to check that filtering by keyspace works for APIs Signed-off-by: Manan Gupta * feat: remove remaining usages of ClusterName Signed-off-by: Manan Gupta * refactor: fix comment explaining sleep in the test Signed-off-by: Manan Gupta * feat: add code to prevent filtering just by shard and add tests for it Signed-off-by: Manan Gupta Signed-off-by: Manan Gupta --- go/cmd/vtorc/status.go | 2 +- go/test/endtoend/vtorc/api/api_test.go | 20 + .../vtorc/readtopologyinstance/main_test.go | 2 - go/vt/vtorc/attributes/attributes.go | 26 - go/vt/vtorc/attributes/attributes_dao.go | 109 -- go/vt/vtorc/config/config.go | 12 +- go/vt/vtorc/db/db.go | 71 +- go/vt/vtorc/db/generate_base.go | 1294 +++++++++-------- go/vt/vtorc/db/generate_patches.go | 559 ------- go/vt/vtorc/inst/analysis_dao.go | 36 +- go/vt/vtorc/inst/analysis_dao_test.go | 2 +- go/vt/vtorc/inst/audit_dao.go | 16 +- go/vt/vtorc/inst/audit_dao_test.go | 117 ++ go/vt/vtorc/inst/cluster.go | 4 +- go/vt/vtorc/inst/cluster_domain_dao.go | 60 - go/vt/vtorc/inst/downtime_dao.go | 2 +- go/vt/vtorc/inst/instance.go | 1 - go/vt/vtorc/inst/instance_dao.go | 87 +- go/vt/vtorc/inst/instance_dao_test.go | 64 +- go/vt/vtorc/inst/minimal_instance.go | 15 - go/vt/vtorc/logic/orchestrator.go | 2 - go/vt/vtorc/logic/tablet_discovery.go | 3 - go/vt/vtorc/logic/topology_recovery.go | 12 +- go/vt/vtorc/logic/topology_recovery_dao.go | 46 +- .../vtorc/logic/topology_recovery_dao_test.go | 70 + go/vt/vtorc/server/api.go | 28 +- go/vt/vtorc/test/recovery_analysis.go | 6 - 
27 files changed, 1010 insertions(+), 1656 deletions(-) delete mode 100644 go/vt/vtorc/attributes/attributes.go delete mode 100644 go/vt/vtorc/attributes/attributes_dao.go delete mode 100644 go/vt/vtorc/db/generate_patches.go create mode 100644 go/vt/vtorc/inst/audit_dao_test.go delete mode 100644 go/vt/vtorc/inst/cluster_domain_dao.go delete mode 100644 go/vt/vtorc/inst/minimal_instance.go create mode 100644 go/vt/vtorc/logic/topology_recovery_dao_test.go diff --git a/go/cmd/vtorc/status.go b/go/cmd/vtorc/status.go index bdb54963051..a4d8a59d3fc 100644 --- a/go/cmd/vtorc/status.go +++ b/go/cmd/vtorc/status.go @@ -24,7 +24,7 @@ import ( // addStatusParts adds UI parts to the /debug/status page of VTOrc func addStatusParts() { servenv.AddStatusPart("Recent Recoveries", logic.TopologyRecoveriesTemplate, func() any { - recoveries, _ := logic.ReadRecentRecoveries("", false, 0) + recoveries, _ := logic.ReadRecentRecoveries(false, 0) return recoveries }) } diff --git a/go/test/endtoend/vtorc/api/api_test.go b/go/test/endtoend/vtorc/api/api_test.go index 87312004f7a..4885a67aa9c 100644 --- a/go/test/endtoend/vtorc/api/api_test.go +++ b/go/test/endtoend/vtorc/api/api_test.go @@ -107,10 +107,20 @@ func TestProblemsAPI(t *testing.T) { assert.Equal(t, 200, status, resp) assert.Contains(t, resp, fmt.Sprintf(`"Port": %d`, replica.MySQLPort)) + // Verify that filtering by keyspace also works in the API as intended + status, resp = utils.MakeAPICall(t, vtorc, "/api/replication-analysis?keyspace=ks") + assert.Equal(t, 200, status, resp) + assert.Contains(t, resp, fmt.Sprintf(`"Port": %d`, replica.MySQLPort)) + // Check that filtering using keyspace and shard works status, resp = utils.MakeAPICall(t, vtorc, "/api/replication-analysis?keyspace=ks&shard=80-") assert.Equal(t, 200, status, resp) assert.Equal(t, "[]", resp) + + // Check that filtering using just the shard fails + status, resp = utils.MakeAPICall(t, vtorc, "/api/replication-analysis?shard=0") + assert.Equal(t, 400, 
status, resp) + assert.Equal(t, "Filtering by shard without keyspace isn't supported\n", resp) }) t.Run("Enable Recoveries API", func(t *testing.T) { @@ -150,9 +160,19 @@ func TestProblemsAPI(t *testing.T) { assert.Equal(t, 200, status, resp) assert.Contains(t, resp, fmt.Sprintf(`"InstanceAlias": "%v"`, replica.Alias)) + // Check that filtering using keyspace works + status, resp = utils.MakeAPICall(t, vtorc, "/api/problems?keyspace=ks") + assert.Equal(t, 200, status, resp) + assert.Contains(t, resp, fmt.Sprintf(`"InstanceAlias": "%v"`, replica.Alias)) + // Check that filtering using keyspace and shard works status, resp = utils.MakeAPICall(t, vtorc, "/api/problems?keyspace=ks&shard=80-") assert.Equal(t, 200, status, resp) assert.Equal(t, "null", resp) + + // Check that filtering using just the shard fails + status, resp = utils.MakeAPICall(t, vtorc, "/api/problems?shard=0") + assert.Equal(t, 400, status, resp) + assert.Equal(t, "Filtering by shard without keyspace isn't supported\n", resp) }) } diff --git a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go index 63ef7f042ad..75ecbfd592c 100644 --- a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go +++ b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go @@ -104,7 +104,6 @@ func TestReadTopologyInstanceBufferable(t *testing.T) { assert.False(t, primaryInstance.HasReplicationCredentials) assert.Equal(t, primaryInstance.ReplicationIOThreadState, inst.ReplicationThreadStateNoThread) assert.Equal(t, primaryInstance.ReplicationSQLThreadState, inst.ReplicationThreadStateNoThread) - assert.Equal(t, fmt.Sprintf("%v:%v", keyspace.Name, shard0.Name), primaryInstance.ClusterName) // insert an errant GTID in the replica _, err = utils.RunSQL(t, "insert into vt_insert_test(id, msg) values (10173, 'test 178342')", replica, "vt_ks") @@ -160,5 +159,4 @@ func TestReadTopologyInstanceBufferable(t *testing.T) { assert.False(t, 
replicaInstance.HasReplicationFilters) assert.LessOrEqual(t, int(replicaInstance.SecondsBehindPrimary.Int64), 1) assert.False(t, replicaInstance.AllowTLS) - assert.Equal(t, fmt.Sprintf("%v:%v", keyspace.Name, shard0.Name), replicaInstance.ClusterName) } diff --git a/go/vt/vtorc/attributes/attributes.go b/go/vt/vtorc/attributes/attributes.go deleted file mode 100644 index 466f57c93d7..00000000000 --- a/go/vt/vtorc/attributes/attributes.go +++ /dev/null @@ -1,26 +0,0 @@ -/* - Copyright 2014 Outbrain Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package attributes - -// HostAttributes presents attributes submitted by a host -type HostAttributes struct { - Hostname string - AttributeName string - AttributeValue string - SubmitTimestamp string - ExpireTimestamp string -} diff --git a/go/vt/vtorc/attributes/attributes_dao.go b/go/vt/vtorc/attributes/attributes_dao.go deleted file mode 100644 index 9e07ac96eaf..00000000000 --- a/go/vt/vtorc/attributes/attributes_dao.go +++ /dev/null @@ -1,109 +0,0 @@ -/* - Copyright 2014 Outbrain Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -package attributes - -import ( - "fmt" - - "vitess.io/vitess/go/vt/external/golib/sqlutils" - "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/vtorc/db" -) - -// SetHostAttributes is used to set host attributes -func SetHostAttributes(hostname string, attributeName string, attributeValue string) error { - _, err := db.ExecVTOrc(` - replace - into host_attributes ( - hostname, attribute_name, attribute_value, submit_timestamp, expire_timestamp - ) VALUES ( - ?, ?, ?, NOW(), NULL - ) - `, - hostname, - attributeName, - attributeValue, - ) - if err != nil { - log.Error(err) - return err - } - - return err -} - -func getHostAttributesByClause(whereClause string, args []any) ([]HostAttributes, error) { - var res []HostAttributes - query := fmt.Sprintf(` - select - hostname, - attribute_name, - attribute_value, - submit_timestamp , - ifnull(expire_timestamp, '') as expire_timestamp - from - host_attributes - %s - order by - hostname, attribute_name - `, whereClause) - - err := db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error { - hostAttributes := HostAttributes{} - hostAttributes.Hostname = m.GetString("hostname") - hostAttributes.AttributeName = m.GetString("attribute_name") - hostAttributes.AttributeValue = m.GetString("attribute_value") - hostAttributes.SubmitTimestamp = m.GetString("submit_timestamp") - hostAttributes.ExpireTimestamp = m.GetString("expire_timestamp") - - res = append(res, hostAttributes) - return nil - }) - - if err != nil { - log.Error(err) - } - return res, err -} - -// GetHostAttribute expects to return a single attribute for a given hostname/attribute-name combination -// or error on empty result -func GetHostAttribute(hostname string, attributeName string) (string, error) { - whereClause := `where hostname=? 
and attribute_name=?` - attributes, err := getHostAttributesByClause(whereClause, sqlutils.Args(hostname, attributeName)) - if err != nil { - return "", err - } - if len(attributeName) == 0 { - log.Errorf("No attribute found for %+v, %+v", hostname, attributeName) - return "", fmt.Errorf("No attribute found for %+v, %+v", hostname, attributeName) - } - return attributes[0].AttributeValue, nil -} - -// SetGeneralAttribute sets an attribute not associated with a host. Its a key-value thing -func SetGeneralAttribute(attributeName string, attributeValue string) error { - if attributeName == "" { - return nil - } - return SetHostAttributes("*", attributeName, attributeValue) -} - -// GetGeneralAttribute expects to return a single attribute value (not associated with a specific hostname) -func GetGeneralAttribute(attributeName string) (result string, err error) { - return GetHostAttribute("*", attributeName) -} diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index fd54e9ed582..3d3dde96034 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -162,23 +162,13 @@ func newConfiguration() *Configuration { } func (config *Configuration) postReadAdjustments() error { - if config.IsSQLite() && config.SQLite3DataFile == "" { + if config.SQLite3DataFile == "" { return fmt.Errorf("SQLite3DataFile must be set") } return nil } -// TODO: Simplify the callers and delete this function -func (config *Configuration) IsSQLite() bool { - return true -} - -// TODO: Simplify the callers and delete this function -func (config *Configuration) IsMySQL() bool { - return false -} - // read reads configuration from given file, or silently skips if the file does not exist. // If the file does exist, then it is expected to be in valid JSON format or the function bails out. 
func read(fileName string) (*Configuration, error) { diff --git a/go/vt/vtorc/db/db.go b/go/vt/vtorc/db/db.go index 2d11f7f2332..04150339c5c 100644 --- a/go/vt/vtorc/db/db.go +++ b/go/vt/vtorc/db/db.go @@ -53,10 +53,6 @@ func (dummyRes DummySQLResult) RowsAffected() (int64, error) { return 1, nil } -func IsSQLite() bool { - return config.Config.IsSQLite() -} - // OpenTopology returns the DB instance for the vtorc backed database func OpenVTOrc() (db *sql.DB, err error) { var fromCache bool @@ -72,11 +68,8 @@ func OpenVTOrc() (db *sql.DB, err error) { return db, err } -func translateStatement(statement string) (string, error) { - if IsSQLite() { - statement = sqlutils.ToSqlite3Dialect(statement) - } - return statement, nil +func translateStatement(statement string) string { + return sqlutils.ToSqlite3Dialect(statement) } // registerVTOrcDeployment updates the vtorc_metadata table upon successful deployment @@ -101,30 +94,8 @@ func deployStatements(db *sql.DB, queries []string) error { if err != nil { log.Fatal(err.Error()) } - // Ugly workaround ahead. - // Origin of this workaround is the existence of some "timestamp NOT NULL," column definitions, - // where in NO_ZERO_IN_DATE,NO_ZERO_DATE sql_mode are invalid (since default is implicitly "0") - // This means installation of vtorc fails on such configured servers, and in particular on 5.7 - // where this setting is the dfault. - // For purpose of backwards compatability, what we do is force sql_mode to be more relaxed, create the schemas - // along with the "invalid" definition, and then go ahead and fix those definitions via following ALTER statements. - // My bad. 
- originalSQLMode := "" - if config.Config.IsMySQL() { - _ = tx.QueryRow(`select @@session.sql_mode`).Scan(&originalSQLMode) - if _, err := tx.Exec(`set @@session.sql_mode=REPLACE(@@session.sql_mode, 'NO_ZERO_DATE', '')`); err != nil { - log.Fatal(err.Error()) - } - if _, err := tx.Exec(`set @@session.sql_mode=REPLACE(@@session.sql_mode, 'NO_ZERO_IN_DATE', '')`); err != nil { - log.Fatal(err.Error()) - } - } for _, query := range queries { - query, err := translateStatement(query) - if err != nil { - log.Fatalf("Cannot initiate vtorc: %+v; query=%+v", err, query) - return err - } + query = translateStatement(query) if _, err := tx.Exec(query); err != nil { if strings.Contains(err.Error(), "syntax error") { log.Fatalf("Cannot initiate vtorc: %+v; query=%+v", err, query) @@ -143,11 +114,6 @@ func deployStatements(db *sql.DB, queries []string) error { } } } - if config.Config.IsMySQL() { - if _, err := tx.Exec(`set session sql_mode=?`, originalSQLMode); err != nil { - log.Fatal(err.Error()) - } - } if err := tx.Commit(); err != nil { log.Fatal(err.Error()) } @@ -168,14 +134,11 @@ func ClearVTOrcDatabase() { func initVTOrcDB(db *sql.DB) error { log.Info("Initializing vtorc") log.Info("Migrating database schema") - _ = deployStatements(db, generateSQLBase) - _ = deployStatements(db, generateSQLPatches) + _ = deployStatements(db, vtorcBackend) _ = registerVTOrcDeployment(db) - if IsSQLite() { - _, _ = ExecVTOrc(`PRAGMA journal_mode = WAL`) - _, _ = ExecVTOrc(`PRAGMA synchronous = NORMAL`) - } + _, _ = ExecVTOrc(`PRAGMA journal_mode = WAL`) + _, _ = ExecVTOrc(`PRAGMA synchronous = NORMAL`) return nil } @@ -183,10 +146,7 @@ func initVTOrcDB(db *sql.DB) error { // execInternal func execInternal(db *sql.DB, query string, args ...any) (sql.Result, error) { var err error - query, err = translateStatement(query) - if err != nil { - return nil, err - } + query = translateStatement(query) res, err := sqlutils.ExecNoPrepare(db, query, args...) 
return res, err } @@ -194,10 +154,7 @@ func execInternal(db *sql.DB, query string, args ...any) (sql.Result, error) { // ExecVTOrc will execute given query on the vtorc backend database. func ExecVTOrc(query string, args ...any) (sql.Result, error) { var err error - query, err = translateStatement(query) - if err != nil { - return nil, err - } + query = translateStatement(query) db, err := OpenVTOrc() if err != nil { return nil, err @@ -208,11 +165,7 @@ func ExecVTOrc(query string, args ...any) (sql.Result, error) { // QueryVTOrcRowsMap func QueryVTOrcRowsMap(query string, onRow func(sqlutils.RowMap) error) error { - query, err := translateStatement(query) - if err != nil { - log.Fatalf("Cannot query vtorc: %+v; query=%+v", err, query) - return err - } + query = translateStatement(query) db, err := OpenVTOrc() if err != nil { return err @@ -223,11 +176,7 @@ func QueryVTOrcRowsMap(query string, onRow func(sqlutils.RowMap) error) error { // QueryVTOrc func QueryVTOrc(query string, argsArray []any, onRow func(sqlutils.RowMap) error) error { - query, err := translateStatement(query) - if err != nil { - log.Fatalf("Cannot query vtorc: %+v; query=%+v", err, query) - return err - } + query = translateStatement(query) db, err := OpenVTOrc() if err != nil { return err diff --git a/go/vt/vtorc/db/generate_base.go b/go/vt/vtorc/db/generate_base.go index 85aed1b2b86..3dd3cc437e4 100644 --- a/go/vt/vtorc/db/generate_base.go +++ b/go/vt/vtorc/db/generate_base.go @@ -16,803 +16,813 @@ package db -// generateSQLBase & generateSQLPatches are lists of SQL statements required to build the vtorc backend -var generateSQLBase = []string{ +// vtorcBackend is a list of SQL statements required to build the vtorc backend +var vtorcBackend = []string{ + ` +DROP TABLE IF EXISTS database_instance +`, + ` +CREATE TABLE database_instance ( + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + last_checked timestamp not null default (''), + last_seen timestamp NULL DEFAULT NULL, + 
server_id int NOT NULL, + version varchar(128) NOT NULL, + binlog_format varchar(16) NOT NULL, + log_bin tinyint NOT NULL, + log_replica_updates tinyint NOT NULL, + binary_log_file varchar(128) NOT NULL, + binary_log_pos bigint NOT NULL, + source_host varchar(128) NOT NULL, + source_port smallint NOT NULL, + replica_sql_running tinyint NOT NULL, + replica_io_running tinyint NOT NULL, + source_log_file varchar(128) NOT NULL, + read_source_log_pos bigint NOT NULL, + relay_source_log_file varchar(128) NOT NULL, + exec_source_log_pos bigint NOT NULL, + replication_lag_seconds bigint DEFAULT NULL, + replica_lag_seconds bigint DEFAULT NULL, + read_only TINYint not null default 0, + last_sql_error TEXT not null default '', + last_io_error TEXT not null default '', + oracle_gtid TINYint not null default 0, + mariadb_gtid TINYint not null default 0, + relay_log_file varchar(128) not null default '', + relay_log_pos bigint not null default 0, + pseudo_gtid TINYint not null default 0, + replication_depth TINYint not null default 0, + has_replication_filters TINYint not null default 0, + data_center varchar(32) not null default '', + physical_environment varchar(32) not null default '', + is_co_primary TINYint not null default 0, + sql_delay int not null default 0, + binlog_server TINYint not null default 0, + supports_oracle_gtid TINYint not null default 0, + executed_gtid_set text not null default '', + server_uuid varchar(64) not null default '', + last_attempted_check TIMESTAMP NOT NULL DEFAULT '1971-01-01 00:00:00', + gtid_purged text not null default '', + has_replication_credentials TINYint not null default 0, + allow_tls TINYint not null default 0, + semi_sync_enforced TINYint not null default 0, + instance_alias varchar(128) not null default '', + version_comment varchar(128) NOT NULL DEFAULT '', + major_version varchar(16) not null default '', + binlog_row_image varchar(16) not null default '', + last_discovery_latency bigint not null default 0, + 
semi_sync_primary_enabled TINYint not null default 0, + semi_sync_replica_enabled TINYint not null default 0, + gtid_mode varchar(32) not null default '', + last_check_partial_success tinyint not null default 0, + source_uuid varchar(64) not null default '', + gtid_errant text not null default '', + ancestry_uuid text not null default '', + replication_sql_thread_state tinyint signed not null default 0, + replication_io_thread_state tinyint signed not null default 0, + region varchar(32) not null default '', + semi_sync_primary_timeout int NOT NULL DEFAULT 0, + semi_sync_primary_wait_for_replica_count int NOT NULL DEFAULT 0, + semi_sync_primary_status TINYint NOT NULL DEFAULT 0, + semi_sync_replica_status TINYint NOT NULL DEFAULT 0, + semi_sync_primary_clients int NOT NULL DEFAULT 0, + replication_group_name VARCHAR(64) NOT NULL DEFAULT '', + replication_group_is_single_primary_mode TINYint NOT NULL DEFAULT 1, + replication_group_member_state VARCHAR(16) NOT NULL DEFAULT '', + replication_group_member_role VARCHAR(16) NOT NULL DEFAULT '', + replication_group_members text not null default '', + replication_group_primary_host varchar(128) NOT NULL DEFAULT '', + replication_group_primary_port smallint NOT NULL DEFAULT 0, + PRIMARY KEY (hostname,port) +)`, + ` +CREATE INDEX last_checked_idx_database_instance ON database_instance(last_checked) + `, + ` +CREATE INDEX last_seen_idx_database_instance ON database_instance(last_seen) + `, + ` +DROP TABLE IF EXISTS database_instance_maintenance +`, + ` +CREATE TABLE database_instance_maintenance ( + database_instance_maintenance_id integer, + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + maintenance_active tinyint(4) DEFAULT NULL, + begin_timestamp timestamp NULL DEFAULT NULL, + end_timestamp timestamp NULL DEFAULT NULL, + owner varchar(128) NOT NULL, + reason text NOT NULL, + processing_node_hostname varchar(128) not null default '', + processing_node_token varchar(128) not null default '', + explicitly_bounded 
TINYint not null default 0, + PRIMARY KEY (database_instance_maintenance_id) +)`, + ` +CREATE UNIQUE INDEX maintenance_uidx_database_instance_maintenance ON database_instance_maintenance (maintenance_active, hostname, port) + `, + ` +DROP TABLE IF EXISTS database_instance_long_running_queries +`, + ` +CREATE TABLE database_instance_long_running_queries ( + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + process_id bigint(20) NOT NULL, + process_started_at timestamp not null default (''), + process_user varchar(16) NOT NULL, + process_host varchar(128) NOT NULL, + process_db varchar(128) NOT NULL, + process_command varchar(16) NOT NULL, + process_time_seconds int(11) NOT NULL, + process_state varchar(128) NOT NULL, + process_info varchar(1024) NOT NULL, + PRIMARY KEY (hostname,port,process_id) +)`, + ` +CREATE INDEX process_started_at_idx_database_instance_long_running_queries ON database_instance_long_running_queries (process_started_at) + `, + ` +DROP TABLE IF EXISTS audit +`, + ` +CREATE TABLE audit ( + audit_id integer, + audit_timestamp timestamp not null default (''), + audit_type varchar(128) NOT NULL, + hostname varchar(128) NOT NULL DEFAULT '', + port smallint NOT NULL, + message text NOT NULL, + keyspace varchar(128) NOT NULL, + shard varchar(128) NOT NULL, + PRIMARY KEY (audit_id) +)`, ` - CREATE TABLE IF NOT EXISTS database_instance ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - last_checked timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - last_seen timestamp NULL DEFAULT NULL, - server_id int(10) unsigned NOT NULL, - version varchar(128) CHARACTER SET ascii NOT NULL, - binlog_format varchar(16) CHARACTER SET ascii NOT NULL, - log_bin tinyint(3) unsigned NOT NULL, - log_replica_updates tinyint(3) unsigned NOT NULL, - binary_log_file varchar(128) CHARACTER SET ascii NOT NULL, - binary_log_pos bigint(20) unsigned NOT NULL, - source_host varchar(128) CHARACTER SET ascii NOT NULL, - source_port 
smallint(5) unsigned NOT NULL, - replica_sql_running tinyint(3) unsigned NOT NULL, - replica_io_running tinyint(3) unsigned NOT NULL, - source_log_file varchar(128) CHARACTER SET ascii NOT NULL, - read_source_log_pos bigint(20) unsigned NOT NULL, - relay_source_log_file varchar(128) CHARACTER SET ascii NOT NULL, - exec_source_log_pos bigint(20) unsigned NOT NULL, - replication_lag_seconds bigint(20) unsigned DEFAULT NULL, - replica_lag_seconds bigint(20) unsigned DEFAULT NULL, - cluster_name varchar(128) CHARACTER SET ascii NOT NULL, - PRIMARY KEY (hostname,port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX audit_timestamp_idx_audit ON audit (audit_timestamp) `, ` - DROP INDEX cluster_name_idx ON database_instance +CREATE INDEX host_port_idx_audit ON audit (hostname, port, audit_timestamp) `, ` - CREATE INDEX cluster_name_idx_database_instance ON database_instance(cluster_name) - `, +DROP TABLE IF EXISTS host_agent +`, ` - DROP INDEX last_checked_idx ON database_instance - `, +CREATE TABLE host_agent ( + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + token varchar(128) NOT NULL, + last_submitted timestamp not null default (''), + last_checked timestamp NULL DEFAULT NULL, + last_seen timestamp NULL DEFAULT NULL, + mysql_port smallint DEFAULT NULL, + count_mysql_snapshots smallint NOT NULL, + PRIMARY KEY (hostname) +)`, ` - CREATE INDEX last_checked_idx_database_instance ON database_instance(last_checked) +CREATE INDEX token_idx_host_agent ON host_agent (token) `, ` - DROP INDEX last_seen_idx ON database_instance +CREATE INDEX last_submitted_idx_host_agent ON host_agent (last_submitted) `, ` - CREATE INDEX last_seen_idx_database_instance ON database_instance(last_seen) +CREATE INDEX last_checked_idx_host_agent ON host_agent (last_checked) `, ` - CREATE TABLE IF NOT EXISTS database_instance_maintenance ( - database_instance_maintenance_id int(10) unsigned NOT NULL AUTO_INCREMENT, - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT 
NULL, - maintenance_active tinyint(4) DEFAULT NULL, - begin_timestamp timestamp NULL DEFAULT NULL, - end_timestamp timestamp NULL DEFAULT NULL, - owner varchar(128) CHARACTER SET utf8 NOT NULL, - reason text CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (database_instance_maintenance_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX last_seen_idx_host_agent ON host_agent (last_seen) `, ` - DROP INDEX maintenance_uidx ON database_instance_maintenance - `, +DROP TABLE IF EXISTS agent_seed +`, ` - CREATE UNIQUE INDEX maintenance_uidx_database_instance_maintenance ON database_instance_maintenance (maintenance_active, hostname, port) - `, +CREATE TABLE agent_seed ( + agent_seed_id integer, + target_hostname varchar(128) NOT NULL, + source_hostname varchar(128) NOT NULL, + start_timestamp timestamp not null default (''), + end_timestamp timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', + is_complete tinyint NOT NULL DEFAULT '0', + is_successful tinyint NOT NULL DEFAULT '0', + PRIMARY KEY (agent_seed_id) +)`, + ` +CREATE INDEX target_hostname_idx_agent_seed ON agent_seed (target_hostname,is_complete) + `, + ` +CREATE INDEX source_hostname_idx_agent_seed ON agent_seed (source_hostname,is_complete) + `, ` - CREATE TABLE IF NOT EXISTS database_instance_long_running_queries ( - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - process_id bigint(20) NOT NULL, - process_started_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - process_user varchar(16) CHARACTER SET utf8 NOT NULL, - process_host varchar(128) CHARACTER SET utf8 NOT NULL, - process_db varchar(128) CHARACTER SET utf8 NOT NULL, - process_command varchar(16) CHARACTER SET utf8 NOT NULL, - process_time_seconds int(11) NOT NULL, - process_state varchar(128) CHARACTER SET utf8 NOT NULL, - process_info varchar(1024) CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (hostname,port,process_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX start_timestamp_idx_agent_seed ON agent_seed 
(start_timestamp) `, ` - DROP INDEX process_started_at_idx ON database_instance_long_running_queries +CREATE INDEX is_complete_idx_agent_seed ON agent_seed (is_complete,start_timestamp) `, ` - CREATE INDEX process_started_at_idx_database_instance_long_running_queries ON database_instance_long_running_queries (process_started_at) +CREATE INDEX is_successful_idx_agent_seed ON agent_seed (is_successful, start_timestamp) `, ` - CREATE TABLE IF NOT EXISTS audit ( - audit_id bigint(20) unsigned NOT NULL AUTO_INCREMENT, - audit_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - audit_type varchar(128) CHARACTER SET ascii NOT NULL, - hostname varchar(128) CHARACTER SET ascii NOT NULL DEFAULT '', - port smallint(5) unsigned NOT NULL, - message text CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (audit_id) - ) ENGINE=InnoDB DEFAULT CHARSET=latin1 - `, +DROP TABLE IF EXISTS agent_seed_state +`, ` - DROP INDEX audit_timestamp_idx ON audit - `, +CREATE TABLE agent_seed_state ( + agent_seed_state_id integer, + agent_seed_id int NOT NULL, + state_timestamp timestamp not null default (''), + state_action varchar(127) NOT NULL, + error_message varchar(255) NOT NULL, + PRIMARY KEY (agent_seed_state_id) +)`, ` - CREATE INDEX audit_timestamp_idx_audit ON audit (audit_timestamp) +CREATE INDEX agent_seed_idx_agent_seed_state ON agent_seed_state (agent_seed_id, state_timestamp) `, ` - DROP INDEX host_port_idx ON audit - `, +DROP TABLE IF EXISTS hostname_resolve +`, ` - CREATE INDEX host_port_idx_audit ON audit (hostname, port, audit_timestamp) - `, +CREATE TABLE hostname_resolve ( + hostname varchar(128) NOT NULL, + resolved_hostname varchar(128) NOT NULL, + resolved_timestamp timestamp not null default (''), + PRIMARY KEY (hostname) +)`, ` - CREATE TABLE IF NOT EXISTS host_agent ( - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - token varchar(128) NOT NULL, - last_submitted timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - last_checked timestamp NULL DEFAULT 
NULL, - last_seen timestamp NULL DEFAULT NULL, - mysql_port smallint(5) unsigned DEFAULT NULL, - count_mysql_snapshots smallint(5) unsigned NOT NULL, - PRIMARY KEY (hostname) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX resolved_timestamp_idx_hostname_resolve ON hostname_resolve (resolved_timestamp) `, ` - DROP INDEX token_idx ON host_agent - `, +DROP TABLE IF EXISTS active_node +`, ` - CREATE INDEX token_idx_host_agent ON host_agent (token) - `, +CREATE TABLE active_node ( + anchor tinyint NOT NULL, + hostname varchar(128) NOT NULL, + token varchar(128) NOT NULL, + last_seen_active timestamp not null default (''), + first_seen_active timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', + PRIMARY KEY (anchor) +)`, ` - DROP INDEX last_submitted_idx ON host_agent - `, +DROP TABLE IF EXISTS node_health +`, ` - CREATE INDEX last_submitted_idx_host_agent ON host_agent (last_submitted) - `, +CREATE TABLE node_health ( + hostname varchar(128) NOT NULL, + token varchar(128) NOT NULL, + last_seen_active timestamp not null default (''), + extra_info varchar(128) not null default '', + command varchar(128) not null default '', + app_version varchar(64) NOT NULL DEFAULT "", + first_seen_active timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', + db_backend varchar(255) NOT NULL DEFAULT "", + incrementing_indicator bigint not null default 0, + PRIMARY KEY (hostname, token) +)`, ` - DROP INDEX last_checked_idx ON host_agent - `, +DROP TABLE IF EXISTS topology_recovery +`, ` - CREATE INDEX last_checked_idx_host_agent ON host_agent (last_checked) - `, +CREATE TABLE topology_recovery ( + recovery_id integer, + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + in_active_period tinyint NOT NULL DEFAULT 0, + start_active_period timestamp not null default (''), + end_active_period_unixtime int, + end_recovery timestamp NULL DEFAULT NULL, + processing_node_hostname varchar(128) NOT NULL, + processcing_node_token varchar(128) NOT NULL, + successor_hostname varchar(128) 
DEFAULT NULL, + successor_port smallint DEFAULT NULL, + analysis varchar(128) not null default '', + keyspace varchar(128) NOT NULL, + shard varchar(128) NOT NULL, + count_affected_replicas int not null default 0, + is_successful TINYint NOT NULL DEFAULT 0, + acknowledged TINYint NOT NULL DEFAULT 0, + acknowledged_by varchar(128) not null default '', + acknowledge_comment text not null default '', + participating_instances text not null default '', + lost_replicas text not null default '', + all_errors text not null default '', + acknowledged_at TIMESTAMP NULL, + last_detection_id bigint not null default 0, + successor_alias varchar(128) DEFAULT NULL, + uid varchar(128) not null default '', + PRIMARY KEY (recovery_id) +)`, + ` +CREATE INDEX in_active_start_period_idx_topology_recovery ON topology_recovery (in_active_period, start_active_period) + `, + ` +CREATE INDEX start_active_period_idx_topology_recovery ON topology_recovery (start_active_period) + `, + ` +CREATE UNIQUE INDEX hostname_port_active_period_uidx_topology_recovery ON topology_recovery (hostname, port, in_active_period, end_active_period_unixtime) + `, + ` +DROP TABLE IF EXISTS hostname_unresolve +`, + ` +CREATE TABLE hostname_unresolve ( + hostname varchar(128) NOT NULL, + unresolved_hostname varchar(128) NOT NULL, + last_registered timestamp not null default (''), + PRIMARY KEY (hostname) +)`, + ` +CREATE INDEX unresolved_hostname_idx_hostname_unresolve ON hostname_unresolve (unresolved_hostname) + `, + ` +DROP TABLE IF EXISTS database_instance_topology_history +`, + ` +CREATE TABLE database_instance_topology_history ( + snapshot_unix_timestamp int NOT NULL, + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + source_host varchar(128) NOT NULL, + source_port smallint NOT NULL, + keyspace varchar(128) NOT NULL, + shard varchar(128) NOT NULL, + version varchar(128) not null default '', + PRIMARY KEY (snapshot_unix_timestamp, hostname, port) +)`, + ` +CREATE INDEX 
keyspace_shard_idx_database_instance_topology_history ON database_instance_topology_history (snapshot_unix_timestamp, keyspace, shard) + `, + ` +DROP TABLE IF EXISTS candidate_database_instance +`, + ` +CREATE TABLE candidate_database_instance ( + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + last_suggested timestamp not null default (''), + priority TINYINT SIGNED NOT NULL DEFAULT 1, + promotion_rule text check(promotion_rule in ('must', 'prefer', 'neutral', 'prefer_not', 'must_not')) NOT NULL DEFAULT 'neutral', + PRIMARY KEY (hostname, port) +)`, + ` +CREATE INDEX last_suggested_idx_candidate_database_instance ON candidate_database_instance (last_suggested) + `, + ` +DROP TABLE IF EXISTS database_instance_downtime +`, + ` +CREATE TABLE database_instance_downtime ( + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + downtime_active tinyint(4) DEFAULT NULL, + begin_timestamp timestamp default (''), + end_timestamp timestamp NULL DEFAULT NULL, + owner varchar(128) NOT NULL, + reason text NOT NULL, + PRIMARY KEY (hostname, port) +)`, + ` +DROP TABLE IF EXISTS topology_failure_detection +`, + ` +CREATE TABLE topology_failure_detection ( + detection_id integer, + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + in_active_period tinyint NOT NULL DEFAULT '0', + start_active_period timestamp not null default (''), + end_active_period_unixtime int NOT NULL, + processing_node_hostname varchar(128) NOT NULL, + processcing_node_token varchar(128) NOT NULL, + analysis varchar(128) NOT NULL, + keyspace varchar(128) NOT NULL, + shard varchar(128) NOT NULL, + count_affected_replicas int NOT NULL, + is_actionable tinyint not null default 0, + PRIMARY KEY (detection_id) +)`, ` - DROP INDEX last_seen_idx ON host_agent +CREATE INDEX in_active_start_period_idx_topology_failure_detection ON topology_failure_detection (in_active_period, start_active_period) `, ` - CREATE INDEX last_seen_idx_host_agent ON host_agent (last_seen) - `, +DROP TABLE IF 
EXISTS hostname_resolve_history +`, ` - CREATE TABLE IF NOT EXISTS agent_seed ( - agent_seed_id int(10) unsigned NOT NULL AUTO_INCREMENT, - target_hostname varchar(128) NOT NULL, - source_hostname varchar(128) NOT NULL, - start_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - end_timestamp timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', - is_complete tinyint(3) unsigned NOT NULL DEFAULT '0', - is_successful tinyint(3) unsigned NOT NULL DEFAULT '0', - PRIMARY KEY (agent_seed_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, +CREATE TABLE hostname_resolve_history ( + resolved_hostname varchar(128) NOT NULL, + hostname varchar(128) NOT NULL, + resolved_timestamp timestamp not null default (''), + PRIMARY KEY (resolved_hostname) +)`, ` - DROP INDEX target_hostname_idx ON agent_seed +CREATE INDEX hostname_idx_hostname_resolve_history ON hostname_resolve_history (hostname) `, ` - CREATE INDEX target_hostname_idx_agent_seed ON agent_seed (target_hostname,is_complete) +CREATE INDEX resolved_timestamp_idx_hostname_resolve_history ON hostname_resolve_history (resolved_timestamp) `, ` - DROP INDEX source_hostname_idx ON agent_seed - `, +DROP TABLE IF EXISTS hostname_unresolve_history +`, ` - CREATE INDEX source_hostname_idx_agent_seed ON agent_seed (source_hostname,is_complete) - `, +CREATE TABLE hostname_unresolve_history ( + unresolved_hostname varchar(128) NOT NULL, + hostname varchar(128) NOT NULL, + last_registered timestamp not null default (''), + PRIMARY KEY (unresolved_hostname) +)`, ` - DROP INDEX start_timestamp_idx ON agent_seed +CREATE INDEX hostname_idx_hostname_unresolve_history ON hostname_unresolve_history (hostname) `, ` - CREATE INDEX start_timestamp_idx_agent_seed ON agent_seed (start_timestamp) +CREATE INDEX last_registered_idx_hostname_unresolve_history ON hostname_unresolve_history (last_registered) `, ` - DROP INDEX is_complete_idx ON agent_seed - `, +DROP TABLE IF EXISTS primary_position_equivalence +`, ` - CREATE INDEX 
is_complete_idx_agent_seed ON agent_seed (is_complete,start_timestamp) - `, +CREATE TABLE primary_position_equivalence ( + equivalence_id integer, + primary1_hostname varchar(128) NOT NULL, + primary1_port smallint NOT NULL, + primary1_binary_log_file varchar(128) NOT NULL, + primary1_binary_log_pos bigint NOT NULL, + primary2_hostname varchar(128) NOT NULL, + primary2_port smallint NOT NULL, + primary2_binary_log_file varchar(128) NOT NULL, + primary2_binary_log_pos bigint NOT NULL, + last_suggested timestamp not null default (''), + PRIMARY KEY (equivalence_id) +)`, ` - DROP INDEX is_successful_idx ON agent_seed +CREATE UNIQUE INDEX equivalence_uidx_primary_position_equivalence ON primary_position_equivalence (primary1_hostname, primary1_port, primary1_binary_log_file, primary1_binary_log_pos, primary2_hostname, primary2_port) `, ` - CREATE INDEX is_successful_idx_agent_seed ON agent_seed (is_successful, start_timestamp) +CREATE INDEX primary2_idx_primary_position_equivalence ON primary_position_equivalence (primary2_hostname, primary2_port, primary2_binary_log_file, primary2_binary_log_pos) `, ` - CREATE TABLE IF NOT EXISTS agent_seed_state ( - agent_seed_state_id int(10) unsigned NOT NULL AUTO_INCREMENT, - agent_seed_id int(10) unsigned NOT NULL, - state_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - state_action varchar(127) NOT NULL, - error_message varchar(255) NOT NULL, - PRIMARY KEY (agent_seed_state_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX last_suggested_idx_primary_position_equivalence ON primary_position_equivalence (last_suggested) `, ` - DROP INDEX agent_seed_idx ON agent_seed_state - `, +DROP TABLE IF EXISTS async_request +`, ` - CREATE INDEX agent_seed_idx_agent_seed_state ON agent_seed_state (agent_seed_id, state_timestamp) - `, +CREATE TABLE async_request ( + request_id integer, + command varchar(128) not null, + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + destination_hostname varchar(128) NOT NULL, + 
destination_port smallint NOT NULL, + pattern text NOT NULL, + gtid_hint varchar(32) not null, + begin_timestamp timestamp NULL DEFAULT NULL, + end_timestamp timestamp NULL DEFAULT NULL, + story text NOT NULL, + PRIMARY KEY (request_id) +)`, ` - CREATE TABLE IF NOT EXISTS host_attributes ( - hostname varchar(128) NOT NULL, - attribute_name varchar(128) NOT NULL, - attribute_value varchar(128) NOT NULL, - submit_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - expire_timestamp timestamp NULL DEFAULT NULL, - PRIMARY KEY (hostname,attribute_name) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX begin_timestamp_idx_async_request ON async_request (begin_timestamp) `, ` - DROP INDEX attribute_name_idx ON host_attributes +CREATE INDEX end_timestamp_idx_async_request ON async_request (end_timestamp) `, ` - CREATE INDEX attribute_name_idx_host_attributes ON host_attributes (attribute_name) - `, +DROP TABLE IF EXISTS blocked_topology_recovery +`, ` - DROP INDEX attribute_value_idx ON host_attributes - `, +CREATE TABLE blocked_topology_recovery ( + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + keyspace varchar(128) NOT NULL, + shard varchar(128) NOT NULL, + analysis varchar(128) NOT NULL, + last_blocked_timestamp timestamp not null default (''), + blocking_recovery_id bigint, + PRIMARY KEY (hostname, port) +)`, ` - CREATE INDEX attribute_value_idx_host_attributes ON host_attributes (attribute_value) +CREATE INDEX keyspace_shard_blocked_idx_blocked_topology_recovery ON blocked_topology_recovery (keyspace, shard, last_blocked_timestamp) `, ` - DROP INDEX submit_timestamp_idx ON host_attributes - `, +DROP TABLE IF EXISTS database_instance_last_analysis +`, ` - CREATE INDEX submit_timestamp_idx_host_attributes ON host_attributes (submit_timestamp) - `, +CREATE TABLE database_instance_last_analysis ( + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + analysis_timestamp timestamp not null default (''), + analysis varchar(128) NOT NULL, + 
PRIMARY KEY (hostname, port) +)`, ` - DROP INDEX expire_timestamp_idx ON host_attributes +CREATE INDEX analysis_timestamp_idx_database_instance_last_analysis ON database_instance_last_analysis (analysis_timestamp) `, ` - CREATE INDEX expire_timestamp_idx_host_attributes ON host_attributes (expire_timestamp) - `, +DROP TABLE IF EXISTS database_instance_analysis_changelog +`, ` - CREATE TABLE IF NOT EXISTS hostname_resolve ( - hostname varchar(128) NOT NULL, - resolved_hostname varchar(128) NOT NULL, - resolved_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (hostname) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, +CREATE TABLE database_instance_analysis_changelog ( + changelog_id integer, + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + analysis_timestamp timestamp not null default (''), + analysis varchar(128) NOT NULL, + PRIMARY KEY (changelog_id) +)`, ` - DROP INDEX resolved_timestamp_idx ON hostname_resolve +CREATE INDEX analysis_timestamp_idx_database_instance_analysis_changelog ON database_instance_analysis_changelog (analysis_timestamp) `, ` - CREATE INDEX resolved_timestamp_idx_hostname_resolve ON hostname_resolve (resolved_timestamp) - `, +DROP TABLE IF EXISTS node_health_history +`, ` - CREATE TABLE IF NOT EXISTS active_node ( - anchor tinyint unsigned NOT NULL, - hostname varchar(128) CHARACTER SET ascii NOT NULL, - token varchar(128) NOT NULL, - last_seen_active timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (anchor) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, +CREATE TABLE node_health_history ( + history_id integer, + hostname varchar(128) NOT NULL, + token varchar(128) NOT NULL, + first_seen_active timestamp NOT NULL, + extra_info varchar(128) NOT NULL, + command varchar(128) not null default '', + app_version varchar(64) NOT NULL DEFAULT "", + PRIMARY KEY (history_id) +)`, ` - INSERT IGNORE INTO active_node (anchor, hostname, token, last_seen_active) - VALUES (1, '', '', NOW()) +CREATE INDEX 
first_seen_active_idx_node_health_history ON node_health_history (first_seen_active) `, ` - CREATE TABLE IF NOT EXISTS node_health ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - token varchar(128) NOT NULL, - last_seen_active timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (hostname, token) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE UNIQUE INDEX hostname_token_idx_node_health_history ON node_health_history (hostname, token) `, ` - DROP VIEW IF EXISTS _whats_wrong - `, +DROP TABLE IF EXISTS database_instance_coordinates_history +`, ` - DROP VIEW IF EXISTS whats_wrong - `, +CREATE TABLE database_instance_coordinates_history ( + history_id integer, + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + recorded_timestamp timestamp not null default (''), + binary_log_file varchar(128) NOT NULL, + binary_log_pos bigint NOT NULL, + relay_log_file varchar(128) NOT NULL, + relay_log_pos bigint NOT NULL, + last_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', + PRIMARY KEY (history_id) +)`, ` - DROP VIEW IF EXISTS whats_wrong_summary +CREATE INDEX hostname_port_recorded_idx_database_instance_coordinates_history ON database_instance_coordinates_history (hostname, port, recorded_timestamp) `, ` - CREATE TABLE IF NOT EXISTS topology_recovery ( - recovery_id bigint unsigned not null auto_increment, - hostname varchar(128) NOT NULL, - port smallint unsigned NOT NULL, - in_active_period tinyint unsigned NOT NULL DEFAULT 0, - start_active_period timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - end_active_period_unixtime int unsigned, - end_recovery timestamp NULL DEFAULT NULL, - processing_node_hostname varchar(128) CHARACTER SET ascii NOT NULL, - processcing_node_token varchar(128) NOT NULL, - successor_hostname varchar(128) DEFAULT NULL, - successor_port smallint unsigned DEFAULT NULL, - PRIMARY KEY (recovery_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX recorded_timestmp_idx_database_instance_coordinates_history ON 
database_instance_coordinates_history (recorded_timestamp) `, ` - DROP INDEX in_active_start_period_idx ON topology_recovery - `, +DROP TABLE IF EXISTS database_instance_binlog_files_history +`, ` - CREATE INDEX in_active_start_period_idx_topology_recovery ON topology_recovery (in_active_period, start_active_period) - `, +CREATE TABLE database_instance_binlog_files_history ( + history_id integer, + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + binary_log_file varchar(128) NOT NULL, + binary_log_pos bigint NOT NULL, + first_seen timestamp not null default (''), + last_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', + PRIMARY KEY (history_id) +)`, ` - DROP INDEX start_active_period_idx ON topology_recovery +CREATE UNIQUE INDEX hostname_port_file_idx_database_instance_binlog_files_history ON database_instance_binlog_files_history (hostname, port, binary_log_file) `, ` - CREATE INDEX start_active_period_idx_topology_recovery ON topology_recovery (start_active_period) +CREATE INDEX last_seen_idx_database_instance_binlog_files_history ON database_instance_binlog_files_history (last_seen) `, ` - DROP INDEX hostname_port_active_period_uidx ON topology_recovery - `, +DROP TABLE IF EXISTS database_instance_recent_relaylog_history +`, ` - CREATE UNIQUE INDEX hostname_port_active_period_uidx_topology_recovery ON topology_recovery (hostname, port, in_active_period, end_active_period_unixtime) - `, +CREATE TABLE database_instance_recent_relaylog_history ( + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + current_relay_log_file varchar(128) NOT NULL, + current_relay_log_pos bigint NOT NULL, + current_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', + prev_relay_log_file varchar(128) NOT NULL, + prev_relay_log_pos bigint NOT NULL, + prev_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', + PRIMARY KEY (hostname, port) +)`, ` - CREATE TABLE IF NOT EXISTS hostname_unresolve ( - hostname varchar(128) NOT NULL, - unresolved_hostname 
varchar(128) NOT NULL, - PRIMARY KEY (hostname) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX current_seen_idx_database_instance_recent_relaylog_history ON database_instance_recent_relaylog_history (current_seen) `, ` - DROP INDEX unresolved_hostname_idx ON hostname_unresolve - `, +DROP TABLE IF EXISTS vtorc_metadata +`, ` - CREATE INDEX unresolved_hostname_idx_hostname_unresolve ON hostname_unresolve (unresolved_hostname) - `, +CREATE TABLE vtorc_metadata ( + anchor tinyint NOT NULL, + last_deployed_version varchar(128) NOT NULL, + last_deployed_timestamp timestamp NOT NULL, + PRIMARY KEY (anchor) +)`, + ` +DROP TABLE IF EXISTS vtorc_db_deployments +`, ` - CREATE TABLE IF NOT EXISTS database_instance_topology_history ( - snapshot_unix_timestamp INT UNSIGNED NOT NULL, - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - source_host varchar(128) CHARACTER SET ascii NOT NULL, - source_port smallint(5) unsigned NOT NULL, - cluster_name tinytext CHARACTER SET ascii NOT NULL, - PRIMARY KEY (snapshot_unix_timestamp, hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, +CREATE TABLE vtorc_db_deployments ( + deployed_version varchar(128) NOT NULL, + deployed_timestamp timestamp NOT NULL, + PRIMARY KEY (deployed_version) +)`, + ` +DROP TABLE IF EXISTS global_recovery_disable +`, + ` +CREATE TABLE global_recovery_disable ( + disable_recovery tinyint NOT NULL , + PRIMARY KEY (disable_recovery) +)`, ` - DROP INDEX cluster_name_idx ON database_instance_topology_history - `, +DROP TABLE IF EXISTS topology_recovery_steps +`, ` - CREATE INDEX cluster_name_idx_database_instance_topology_history ON database_instance_topology_history (snapshot_unix_timestamp, cluster_name(128)) - `, +CREATE TABLE topology_recovery_steps ( + recovery_step_id integer, + recovery_uid varchar(128) NOT NULL, + audit_at timestamp not null default (''), + message text NOT NULL, + PRIMARY KEY (recovery_step_id) +)`, + ` +DROP TABLE IF EXISTS raft_store 
+`, + ` +CREATE TABLE raft_store ( + store_id integer, + store_key varbinary(512) not null, + store_value blob not null, + PRIMARY KEY (store_id) +)`, + ` +CREATE INDEX store_key_idx_raft_store ON raft_store (store_key) + `, + ` +DROP TABLE IF EXISTS raft_log +`, + ` +CREATE TABLE raft_log ( + log_index integer, + term bigint not null, + log_type int not null, + data blob not null, + PRIMARY KEY (log_index) +)`, + ` +DROP TABLE IF EXISTS raft_snapshot +`, + ` +CREATE TABLE raft_snapshot ( + snapshot_id integer, + snapshot_name varchar(128) NOT NULL, + snapshot_meta varchar(4096) NOT NULL, + created_at timestamp not null default (''), + PRIMARY KEY (snapshot_id) +)`, ` - CREATE TABLE IF NOT EXISTS candidate_database_instance ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - last_suggested TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE UNIQUE INDEX snapshot_name_uidx_raft_snapshot ON raft_snapshot (snapshot_name) `, ` - DROP INDEX last_suggested_idx ON candidate_database_instance - `, +DROP TABLE IF EXISTS database_instance_peer_analysis +`, ` - CREATE INDEX last_suggested_idx_candidate_database_instance ON candidate_database_instance (last_suggested) - `, +CREATE TABLE database_instance_peer_analysis ( + peer varchar(128) NOT NULL, + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + analysis_timestamp timestamp not null default (''), + analysis varchar(128) NOT NULL, + PRIMARY KEY (peer, hostname, port) +)`, ` - CREATE TABLE IF NOT EXISTS database_instance_downtime ( - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - downtime_active tinyint(4) DEFAULT NULL, - begin_timestamp timestamp DEFAULT CURRENT_TIMESTAMP, - end_timestamp timestamp NULL DEFAULT NULL, - owner varchar(128) CHARACTER SET utf8 NOT NULL, - reason text CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT 
CHARSET=ascii - `, +DROP TABLE IF EXISTS database_instance_tls +`, ` - CREATE TABLE IF NOT EXISTS topology_failure_detection ( - detection_id bigint(20) unsigned NOT NULL AUTO_INCREMENT, - hostname varchar(128) NOT NULL, - port smallint unsigned NOT NULL, - in_active_period tinyint unsigned NOT NULL DEFAULT '0', - start_active_period timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - end_active_period_unixtime int unsigned NOT NULL, - processing_node_hostname varchar(128) NOT NULL, - processcing_node_token varchar(128) NOT NULL, - analysis varchar(128) NOT NULL, - cluster_name varchar(128) NOT NULL, - count_affected_replicas int unsigned NOT NULL, - PRIMARY KEY (detection_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, +CREATE TABLE database_instance_tls ( + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + required tinyint NOT NULL DEFAULT 0, + PRIMARY KEY (hostname,port) +)`, ` - DROP INDEX hostname_port_active_period_uidx ON topology_failure_detection - `, +DROP TABLE IF EXISTS hostname_ips +`, ` - DROP INDEX in_active_start_period_idx ON topology_failure_detection - `, +CREATE TABLE hostname_ips ( + hostname varchar(128) NOT NULL, + ipv4 varchar(128) NOT NULL, + ipv6 varchar(128) NOT NULL, + last_updated timestamp not null default (''), + PRIMARY KEY (hostname) +)`, ` - CREATE INDEX in_active_start_period_idx_topology_failure_detection ON topology_failure_detection (in_active_period, start_active_period) - `, +DROP TABLE IF EXISTS database_instance_tags +`, ` - CREATE TABLE IF NOT EXISTS hostname_resolve_history ( - resolved_hostname varchar(128) NOT NULL, - hostname varchar(128) NOT NULL, - resolved_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (resolved_hostname) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, +CREATE TABLE database_instance_tags ( + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + tag_name varchar(128) NOT NULL, + tag_value varchar(128) NOT NULL, + last_updated timestamp not null default (''), + 
PRIMARY KEY (hostname, port, tag_name) +)`, ` - DROP INDEX hostname ON hostname_resolve_history +CREATE INDEX tag_name_idx_database_instance_tags ON database_instance_tags (tag_name) `, ` - CREATE INDEX hostname_idx_hostname_resolve_history ON hostname_resolve_history (hostname) - `, +DROP TABLE IF EXISTS database_instance_stale_binlog_coordinates +`, ` - DROP INDEX resolved_timestamp_idx ON hostname_resolve_history - `, +CREATE TABLE database_instance_stale_binlog_coordinates ( + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + binary_log_file varchar(128) NOT NULL, + binary_log_pos bigint NOT NULL, + first_seen timestamp not null default (''), + PRIMARY KEY (hostname, port) +)`, ` - CREATE INDEX resolved_timestamp_idx_hostname_resolve_history ON hostname_resolve_history (resolved_timestamp) +CREATE INDEX first_seen_idx_database_instance_stale_binlog_coordinates ON database_instance_stale_binlog_coordinates (first_seen) `, ` - CREATE TABLE IF NOT EXISTS hostname_unresolve_history ( - unresolved_hostname varchar(128) NOT NULL, - hostname varchar(128) NOT NULL, - last_registered TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (unresolved_hostname) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, +DROP TABLE IF EXISTS vitess_tablet +`, ` - DROP INDEX hostname ON hostname_unresolve_history - `, +CREATE TABLE vitess_tablet ( + hostname varchar(128) NOT NULL, + port smallint NOT NULL, + keyspace varchar(128) NOT NULL, + shard varchar(128) NOT NULL, + cell varchar(128) NOT NULL, + tablet_type smallint(5) NOT NULL, + primary_timestamp timestamp NOT NULL, + info varchar(512) NOT NULL, + PRIMARY KEY (hostname, port) +)`, ` - CREATE INDEX hostname_idx_hostname_unresolve_history ON hostname_unresolve_history (hostname) +CREATE INDEX cell_idx_vitess_tablet ON vitess_tablet (cell) `, ` - DROP INDEX last_registered_idx ON hostname_unresolve_history +CREATE INDEX ks_idx_vitess_tablet ON vitess_tablet (keyspace, shard) `, ` - CREATE INDEX 
last_registered_idx_hostname_unresolve_history ON hostname_unresolve_history (last_registered) - `, - ` - CREATE TABLE IF NOT EXISTS cluster_domain_name ( - cluster_name varchar(128) CHARACTER SET ascii NOT NULL, - domain_name varchar(128) NOT NULL, - PRIMARY KEY (cluster_name) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, +DROP TABLE IF EXISTS vitess_keyspace +`, ` - DROP INDEX domain_name_idx ON cluster_domain_name - `, - ` - CREATE INDEX domain_name_idx_cluster_domain_name ON cluster_domain_name (domain_name(32)) - `, - ` - CREATE TABLE IF NOT EXISTS primary_position_equivalence ( - equivalence_id bigint unsigned not null auto_increment, - primary1_hostname varchar(128) CHARACTER SET ascii NOT NULL, - primary1_port smallint(5) unsigned NOT NULL, - primary1_binary_log_file varchar(128) CHARACTER SET ascii NOT NULL, - primary1_binary_log_pos bigint(20) unsigned NOT NULL, - primary2_hostname varchar(128) CHARACTER SET ascii NOT NULL, - primary2_port smallint(5) unsigned NOT NULL, - primary2_binary_log_file varchar(128) CHARACTER SET ascii NOT NULL, - primary2_binary_log_pos bigint(20) unsigned NOT NULL, - last_suggested TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (equivalence_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX equivalence_uidx ON primary_position_equivalence - `, - ` - CREATE UNIQUE INDEX equivalence_uidx_primary_position_equivalence ON primary_position_equivalence (primary1_hostname, primary1_port, primary1_binary_log_file, primary1_binary_log_pos, primary2_hostname, primary2_port) - `, - ` - DROP INDEX primary2_idx ON primary_position_equivalence - `, - ` - CREATE INDEX primary2_idx_primary_position_equivalence ON primary_position_equivalence (primary2_hostname, primary2_port, primary2_binary_log_file, primary2_binary_log_pos) - `, - ` - DROP INDEX last_suggested_idx ON primary_position_equivalence - `, - ` - CREATE INDEX last_suggested_idx_primary_position_equivalence ON primary_position_equivalence (last_suggested) 
- `, - ` - CREATE TABLE IF NOT EXISTS async_request ( - request_id bigint unsigned NOT NULL AUTO_INCREMENT, - command varchar(128) charset ascii not null, - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - destination_hostname varchar(128) NOT NULL, - destination_port smallint(5) unsigned NOT NULL, - pattern text CHARACTER SET utf8 NOT NULL, - gtid_hint varchar(32) charset ascii not null, - begin_timestamp timestamp NULL DEFAULT NULL, - end_timestamp timestamp NULL DEFAULT NULL, - story text CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (request_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX begin_timestamp_idx ON async_request - `, - ` - CREATE INDEX begin_timestamp_idx_async_request ON async_request (begin_timestamp) - `, - ` - DROP INDEX end_timestamp_idx ON async_request - `, - ` - CREATE INDEX end_timestamp_idx_async_request ON async_request (end_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS blocked_topology_recovery ( - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - cluster_name varchar(128) NOT NULL, - analysis varchar(128) NOT NULL, - last_blocked_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - blocking_recovery_id bigint unsigned, - PRIMARY KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX cluster_blocked_idx ON blocked_topology_recovery - `, - ` - CREATE INDEX cluster_blocked_idx_blocked_topology_recovery ON blocked_topology_recovery (cluster_name, last_blocked_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_last_analysis ( - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - analysis_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - analysis varchar(128) NOT NULL, - PRIMARY KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX analysis_timestamp_idx ON database_instance_last_analysis - `, - ` - CREATE INDEX analysis_timestamp_idx_database_instance_last_analysis ON 
database_instance_last_analysis (analysis_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_analysis_changelog ( - changelog_id bigint unsigned not null auto_increment, - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - analysis_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - analysis varchar(128) NOT NULL, - PRIMARY KEY (changelog_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX analysis_timestamp_idx ON database_instance_analysis_changelog - `, - ` - CREATE INDEX analysis_timestamp_idx_database_instance_analysis_changelog ON database_instance_analysis_changelog (analysis_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS node_health_history ( - history_id bigint unsigned not null auto_increment, - hostname varchar(128) CHARACTER SET ascii NOT NULL, - token varchar(128) NOT NULL, - first_seen_active timestamp NOT NULL, - extra_info varchar(128) CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (history_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX first_seen_active_idx ON node_health_history - `, - ` - CREATE INDEX first_seen_active_idx_node_health_history ON node_health_history (first_seen_active) - `, - ` - DROP INDEX hostname_token_idx ON node_health_history - `, - ` - CREATE UNIQUE INDEX hostname_token_idx_node_health_history ON node_health_history (hostname, token) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_coordinates_history ( - history_id bigint unsigned not null auto_increment, - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - recorded_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - binary_log_file varchar(128) NOT NULL, - binary_log_pos bigint(20) unsigned NOT NULL, - relay_log_file varchar(128) NOT NULL, - relay_log_pos bigint(20) unsigned NOT NULL, - PRIMARY KEY (history_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX hostname_port_recorded_timestmp_idx ON database_instance_coordinates_history - `, - ` - CREATE 
INDEX hostname_port_recorded_idx_database_instance_coordinates_history ON database_instance_coordinates_history (hostname, port, recorded_timestamp) - `, - ` - DROP INDEX recorded_timestmp_idx ON database_instance_coordinates_history - `, - ` - CREATE INDEX recorded_timestmp_idx_database_instance_coordinates_history ON database_instance_coordinates_history (recorded_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_binlog_files_history ( - history_id bigint unsigned not null auto_increment, - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - binary_log_file varchar(128) NOT NULL, - binary_log_pos bigint(20) unsigned NOT NULL, - first_seen timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - last_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', - PRIMARY KEY (history_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX hostname_port_file_idx ON database_instance_binlog_files_history - `, - ` - CREATE UNIQUE INDEX hostname_port_file_idx_database_instance_binlog_files_history ON database_instance_binlog_files_history (hostname, port, binary_log_file) - `, - ` - DROP INDEX last_seen_idx ON database_instance_binlog_files_history - `, - ` - CREATE INDEX last_seen_idx_database_instance_binlog_files_history ON database_instance_binlog_files_history (last_seen) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_recent_relaylog_history ( - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - current_relay_log_file varchar(128) NOT NULL, - current_relay_log_pos bigint(20) unsigned NOT NULL, - current_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', - prev_relay_log_file varchar(128) NOT NULL, - prev_relay_log_pos bigint(20) unsigned NOT NULL, - prev_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', - PRIMARY KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX current_seen_idx ON database_instance_recent_relaylog_history - `, - ` - CREATE INDEX 
current_seen_idx_database_instance_recent_relaylog_history ON database_instance_recent_relaylog_history (current_seen) - `, - ` - CREATE TABLE IF NOT EXISTS vtorc_metadata ( - anchor tinyint unsigned NOT NULL, - last_deployed_version varchar(128) CHARACTER SET ascii NOT NULL, - last_deployed_timestamp timestamp NOT NULL, - PRIMARY KEY (anchor) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS vtorc_db_deployments ( - deployed_version varchar(128) CHARACTER SET ascii NOT NULL, - deployed_timestamp timestamp NOT NULL, - PRIMARY KEY (deployed_version) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS global_recovery_disable ( - disable_recovery tinyint unsigned NOT NULL COMMENT 'Insert 1 to disable recovery globally', - PRIMARY KEY (disable_recovery) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS topology_recovery_steps ( - recovery_step_id bigint unsigned not null auto_increment, - recovery_uid varchar(128) CHARACTER SET ascii NOT NULL, - audit_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - message text CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (recovery_step_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS raft_store ( - store_id bigint unsigned not null auto_increment, - store_key varbinary(512) not null, - store_value blob not null, - PRIMARY KEY (store_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE INDEX store_key_idx_raft_store ON raft_store (store_key) - `, +CREATE TABLE vitess_keyspace ( + keyspace varchar(128) NOT NULL, + keyspace_type smallint(5) NOT NULL, + durability_policy varchar(512) NOT NULL, + PRIMARY KEY (keyspace) +)`, ` - CREATE TABLE IF NOT EXISTS raft_log ( - log_index bigint unsigned not null auto_increment, - term bigint not null, - log_type int not null, - data blob not null, - PRIMARY KEY (log_index) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX 
source_host_port_idx_database_instance_database_instance on database_instance (source_host, source_port) `, ` - CREATE TABLE IF NOT EXISTS raft_snapshot ( - snapshot_id bigint unsigned not null auto_increment, - snapshot_name varchar(128) CHARACTER SET utf8 NOT NULL, - snapshot_meta varchar(4096) CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (snapshot_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX active_timestamp_idx_database_instance_maintenance on database_instance_maintenance (maintenance_active, begin_timestamp) `, ` - CREATE UNIQUE INDEX snapshot_name_uidx_raft_snapshot ON raft_snapshot (snapshot_name) +CREATE INDEX active_end_timestamp_idx_database_instance_maintenance on database_instance_maintenance (maintenance_active, end_timestamp) `, ` - CREATE TABLE IF NOT EXISTS database_instance_peer_analysis ( - peer varchar(128) NOT NULL, - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - analysis_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - analysis varchar(128) NOT NULL, - PRIMARY KEY (peer, hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX last_registered_idx_hostname_unresolve on hostname_unresolve (last_registered) `, ` - CREATE TABLE IF NOT EXISTS database_instance_tls ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - required tinyint unsigned NOT NULL DEFAULT 0, - PRIMARY KEY (hostname,port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX keyspace_shard_in_active_idx_topology_recovery on topology_recovery (keyspace, shard, in_active_period) `, ` - CREATE TABLE IF NOT EXISTS cluster_injected_pseudo_gtid ( - cluster_name varchar(128) NOT NULL, - time_injected timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (cluster_name) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX end_recovery_idx_topology_recovery on topology_recovery (end_recovery) `, ` - CREATE TABLE IF NOT EXISTS hostname_ips ( - hostname varchar(128) CHARACTER SET 
ascii NOT NULL, - ipv4 varchar(128) CHARACTER SET ascii NOT NULL, - ipv6 varchar(128) CHARACTER SET ascii NOT NULL, - last_updated timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (hostname) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX acknowledged_idx_topology_recovery on topology_recovery (acknowledged, acknowledged_at) `, ` - CREATE TABLE IF NOT EXISTS database_instance_tags ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - tag_name varchar(128) CHARACTER SET utf8 NOT NULL, - tag_value varchar(128) CHARACTER SET utf8 NOT NULL, - last_updated timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (hostname, port, tag_name) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX last_blocked_idx_blocked_topology_recovery on blocked_topology_recovery (last_blocked_timestamp) `, ` - CREATE INDEX tag_name_idx_database_instance_tags ON database_instance_tags (tag_name) +CREATE INDEX instance_timestamp_idx_database_instance_analysis_changelog on database_instance_analysis_changelog (hostname, port, analysis_timestamp) `, ` - CREATE TABLE IF NOT EXISTS database_instance_stale_binlog_coordinates ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - binary_log_file varchar(128) NOT NULL, - binary_log_pos bigint(20) unsigned NOT NULL, - first_seen timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX last_detection_idx_topology_recovery on topology_recovery (last_detection_id) `, ` - CREATE INDEX first_seen_idx_database_instance_stale_binlog_coordinates ON database_instance_stale_binlog_coordinates (first_seen) +CREATE INDEX last_seen_active_idx_node_health on node_health (last_seen_active) `, ` - CREATE TABLE IF NOT EXISTS vitess_tablet ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - keyspace varchar(128) CHARACTER SET ascii NOT NULL, - 
shard varchar(128) CHARACTER SET ascii NOT NULL, - cell varchar(128) CHARACTER SET ascii NOT NULL, - tablet_type smallint(5) NOT NULL, - primary_timestamp timestamp NOT NULL, - info varchar(512) CHARACTER SET ascii NOT NULL, - PRIMARY KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE INDEX uid_idx_topology_recovery ON topology_recovery(uid) `, ` - CREATE INDEX cell_idx_vitess_tablet ON vitess_tablet (cell) +CREATE INDEX recovery_uid_idx_topology_recovery_steps ON topology_recovery_steps(recovery_uid) `, ` - CREATE INDEX ks_idx_vitess_tablet ON vitess_tablet (keyspace, shard) +CREATE INDEX end_timestamp_idx_database_instance_downtime ON database_instance_downtime(end_timestamp) `, ` - CREATE TABLE IF NOT EXISTS vitess_keyspace ( - keyspace varchar(128) CHARACTER SET ascii NOT NULL, - keyspace_type smallint(5) NOT NULL, - durability_policy varchar(512) CHARACTER SET ascii NOT NULL, - PRIMARY KEY (keyspace) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii +CREATE UNIQUE INDEX host_port_active_recoverable_uidx_topology_failure_detection ON topology_failure_detection (hostname, port, in_active_period, end_active_period_unixtime, is_actionable) `, } diff --git a/go/vt/vtorc/db/generate_patches.go b/go/vt/vtorc/db/generate_patches.go deleted file mode 100644 index 93099f51a57..00000000000 --- a/go/vt/vtorc/db/generate_patches.go +++ /dev/null @@ -1,559 +0,0 @@ -/* - Copyright 2017 Shlomi Noach, GitHub Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -package db - -// generateSQLPatches contains DDLs for patching schema to the latest version. -// Add new statements at the end of the list so they form a changelog. -var generateSQLPatches = []string{ - ` - ALTER TABLE - database_instance - ADD COLUMN read_only TINYINT UNSIGNED NOT NULL AFTER version - `, - ` - ALTER TABLE - database_instance - ADD COLUMN last_sql_error TEXT NOT NULL AFTER exec_source_log_pos - `, - ` - ALTER TABLE - database_instance - ADD COLUMN last_io_error TEXT NOT NULL AFTER last_sql_error - `, - ` - ALTER TABLE - database_instance - ADD COLUMN oracle_gtid TINYINT UNSIGNED NOT NULL AFTER replica_io_running - `, - ` - ALTER TABLE - database_instance - ADD COLUMN mariadb_gtid TINYINT UNSIGNED NOT NULL AFTER oracle_gtid - `, - ` - ALTER TABLE - database_instance - ADD COLUMN relay_log_file varchar(128) CHARACTER SET ascii NOT NULL AFTER exec_source_log_pos - `, - ` - ALTER TABLE - database_instance - ADD COLUMN relay_log_pos bigint unsigned NOT NULL AFTER relay_log_file - `, - ` - DROP INDEX source_host_port_idx ON database_instance - `, - ` - ALTER TABLE - database_instance - ADD INDEX source_host_port_idx_database_instance (source_host, source_port) - `, - ` - ALTER TABLE - database_instance - ADD COLUMN pseudo_gtid TINYINT UNSIGNED NOT NULL AFTER mariadb_gtid - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_depth TINYINT UNSIGNED NOT NULL AFTER cluster_name - `, - ` - ALTER TABLE - database_instance - ADD COLUMN has_replication_filters TINYINT UNSIGNED NOT NULL AFTER replica_io_running - `, - ` - ALTER TABLE - database_instance - ADD COLUMN data_center varchar(32) CHARACTER SET ascii NOT NULL AFTER cluster_name - `, - ` - ALTER TABLE - database_instance - ADD COLUMN physical_environment varchar(32) CHARACTER SET ascii NOT NULL AFTER data_center - `, - ` - ALTER TABLE - database_instance_maintenance - ADD KEY active_timestamp_idx (maintenance_active, begin_timestamp) - `, - ` - ALTER TABLE - database_instance - ADD 
COLUMN is_co_primary TINYINT UNSIGNED NOT NULL AFTER replication_depth - `, - ` - ALTER TABLE - database_instance_maintenance - ADD KEY active_end_timestamp_idx (maintenance_active, end_timestamp) - `, - ` - ALTER TABLE - database_instance - ADD COLUMN sql_delay INT UNSIGNED NOT NULL AFTER replica_lag_seconds - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN analysis varchar(128) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN cluster_name varchar(128) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN count_affected_replicas int unsigned NOT NULL - `, - ` - ALTER TABLE hostname_unresolve - ADD COLUMN last_registered TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - ` - ALTER TABLE hostname_unresolve - ADD KEY last_registered_idx (last_registered) - `, - ` - ALTER TABLE topology_recovery - ADD KEY cluster_name_in_active_idx (cluster_name, in_active_period) - `, - ` - ALTER TABLE topology_recovery - ADD KEY end_recovery_idx (end_recovery) - `, - ` - ALTER TABLE - database_instance - ADD COLUMN binlog_server TINYINT UNSIGNED NOT NULL AFTER version - `, - ` - ALTER TABLE cluster_domain_name - ADD COLUMN last_registered TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - ` - ALTER TABLE cluster_domain_name - ADD KEY last_registered_idx (last_registered) - `, - ` - ALTER TABLE - database_instance - ADD COLUMN supports_oracle_gtid TINYINT UNSIGNED NOT NULL AFTER oracle_gtid - `, - ` - ALTER TABLE - database_instance - ADD COLUMN executed_gtid_set text CHARACTER SET ascii NOT NULL AFTER oracle_gtid - `, - ` - ALTER TABLE - database_instance - ADD COLUMN server_uuid varchar(64) CHARACTER SET ascii NOT NULL AFTER server_id - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN is_successful TINYINT UNSIGNED NOT NULL DEFAULT 0 AFTER processcing_node_token - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN acknowledged TINYINT UNSIGNED NOT NULL DEFAULT 0 - `, - ` - ALTER TABLE - 
topology_recovery - ADD COLUMN acknowledged_by varchar(128) CHARACTER SET utf8 NOT NULL - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN acknowledge_comment text CHARACTER SET utf8 NOT NULL - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN participating_instances text CHARACTER SET ascii NOT NULL after count_affected_replicas - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN lost_replicas text CHARACTER SET ascii NOT NULL after participating_instances - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN all_errors text CHARACTER SET ascii NOT NULL after lost_replicas - `, - ` - ALTER TABLE audit - ADD COLUMN cluster_name varchar(128) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER port - `, - ` - ALTER TABLE candidate_database_instance - ADD COLUMN priority TINYINT SIGNED NOT NULL DEFAULT 1 comment 'positive promote, nagative unpromotes' - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN acknowledged_at TIMESTAMP NULL after acknowledged - `, - ` - ALTER TABLE - topology_recovery - ADD KEY acknowledged_idx (acknowledged, acknowledged_at) - `, - ` - ALTER TABLE - blocked_topology_recovery - ADD KEY last_blocked_idx (last_blocked_timestamp) - `, - ` - ALTER TABLE candidate_database_instance - ADD COLUMN promotion_rule enum('must', 'prefer', 'neutral', 'prefer_not', 'must_not') NOT NULL DEFAULT 'neutral' - `, - ` - ALTER TABLE node_health /* sqlite3-skip */ - DROP PRIMARY KEY, - ADD PRIMARY KEY (hostname, token) - `, - ` - ALTER TABLE node_health - ADD COLUMN extra_info varchar(128) CHARACTER SET utf8 NOT NULL - `, - ` - ALTER TABLE agent_seed /* sqlite3-skip */ - MODIFY end_timestamp timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' - `, - ` - ALTER TABLE active_node /* sqlite3-skip */ - MODIFY last_seen_active timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - - ` - ALTER TABLE node_health /* sqlite3-skip */ - MODIFY last_seen_active timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - ` - ALTER TABLE candidate_database_instance /* 
sqlite3-skip */ - MODIFY last_suggested timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - ` - ALTER TABLE primary_position_equivalence /* sqlite3-skip */ - MODIFY last_suggested timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - ` - ALTER TABLE - database_instance - ADD COLUMN last_attempted_check TIMESTAMP NOT NULL DEFAULT '1971-01-01 00:00:00' AFTER last_checked - `, - ` - ALTER TABLE - database_instance /* sqlite3-skip */ - MODIFY last_attempted_check TIMESTAMP NOT NULL DEFAULT '1971-01-01 00:00:00' - `, - ` - ALTER TABLE - database_instance_analysis_changelog - ADD KEY instance_timestamp_idx (hostname, port, analysis_timestamp) - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN last_detection_id bigint unsigned NOT NULL - `, - ` - ALTER TABLE - topology_recovery - ADD KEY last_detection_idx (last_detection_id) - `, - ` - ALTER TABLE node_health_history - ADD COLUMN command varchar(128) CHARACTER SET utf8 NOT NULL - `, - ` - ALTER TABLE node_health - ADD COLUMN command varchar(128) CHARACTER SET utf8 NOT NULL - `, - ` - ALTER TABLE database_instance_topology_history - ADD COLUMN version varchar(128) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE - database_instance - ADD COLUMN gtid_purged text CHARACTER SET ascii NOT NULL AFTER executed_gtid_set - `, - ` - ALTER TABLE - database_instance_coordinates_history - ADD COLUMN last_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' AFTER recorded_timestamp - `, - ` - ALTER TABLE - database_instance - ADD COLUMN has_replication_credentials TINYINT UNSIGNED NOT NULL - `, - ` - ALTER TABLE - database_instance - ADD COLUMN allow_tls TINYINT UNSIGNED NOT NULL AFTER sql_delay - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_enforced TINYINT UNSIGNED NOT NULL AFTER physical_environment - `, - ` - ALTER TABLE - database_instance - ADD COLUMN instance_alias varchar(128) CHARACTER SET ascii NOT NULL AFTER physical_environment - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN 
successor_alias varchar(128) DEFAULT NULL - `, - ` - ALTER TABLE - database_instance /* sqlite3-skip */ - MODIFY cluster_name varchar(128) NOT NULL - `, - ` - ALTER TABLE - node_health - ADD INDEX last_seen_active_idx (last_seen_active) - `, - ` - ALTER TABLE - database_instance_maintenance - ADD COLUMN processing_node_hostname varchar(128) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE - database_instance_maintenance - ADD COLUMN processing_node_token varchar(128) NOT NULL - `, - ` - ALTER TABLE - database_instance_maintenance - ADD COLUMN explicitly_bounded TINYINT UNSIGNED NOT NULL - `, - ` - ALTER TABLE node_health_history - ADD COLUMN app_version varchar(64) CHARACTER SET ascii NOT NULL DEFAULT "" - `, - ` - ALTER TABLE node_health - ADD COLUMN app_version varchar(64) CHARACTER SET ascii NOT NULL DEFAULT "" - `, - ` - ALTER TABLE node_health_history /* sqlite3-skip */ - MODIFY app_version varchar(64) CHARACTER SET ascii NOT NULL DEFAULT "" - `, - ` - ALTER TABLE node_health /* sqlite3-skip */ - MODIFY app_version varchar(64) CHARACTER SET ascii NOT NULL DEFAULT "" - `, - ` - ALTER TABLE - database_instance - ADD COLUMN version_comment varchar(128) NOT NULL DEFAULT '' - `, - ` - ALTER TABLE active_node - ADD COLUMN first_seen_active timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' - `, - ` - ALTER TABLE node_health - ADD COLUMN first_seen_active timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' - `, - ` - ALTER TABLE database_instance - ADD COLUMN major_version varchar(16) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE - database_instance - ADD COLUMN binlog_row_image varchar(16) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE topology_recovery - ADD COLUMN uid varchar(128) CHARACTER SET ascii NOT NULL - `, - ` - CREATE INDEX uid_idx_topology_recovery ON topology_recovery(uid) - `, - ` - CREATE INDEX recovery_uid_idx_topology_recovery_steps ON topology_recovery_steps(recovery_uid) - `, - ` - ALTER TABLE - database_instance - ADD COLUMN 
last_discovery_latency bigint not null - `, - ` - CREATE INDEX end_timestamp_idx_database_instance_downtime ON database_instance_downtime(end_timestamp) - `, - ` - ALTER TABLE - topology_failure_detection - ADD COLUMN is_actionable tinyint not null default 0 - `, - ` - DROP INDEX hostname_port_active_period_uidx_topology_failure_detection ON topology_failure_detection - `, - ` - CREATE UNIQUE INDEX host_port_active_recoverable_uidx_topology_failure_detection ON topology_failure_detection (hostname, port, in_active_period, end_active_period_unixtime, is_actionable) - `, - ` - ALTER TABLE raft_snapshot - ADD COLUMN created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - ` - ALTER TABLE node_health - ADD COLUMN db_backend varchar(255) CHARACTER SET ascii NOT NULL DEFAULT "" - `, - ` - ALTER TABLE node_health - ADD COLUMN incrementing_indicator bigint not null default 0 - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_primary_enabled TINYINT UNSIGNED NOT NULL - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_replica_enabled TINYINT UNSIGNED NOT NULL - `, - ` - ALTER TABLE - database_instance - ADD COLUMN gtid_mode varchar(32) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE - database_instance - ADD COLUMN last_check_partial_success tinyint unsigned NOT NULL after last_attempted_check - `, - ` - ALTER TABLE - database_instance - ADD COLUMN source_uuid varchar(64) CHARACTER SET ascii NOT NULL AFTER oracle_gtid - `, - ` - ALTER TABLE - database_instance - ADD COLUMN gtid_errant text CHARACTER SET ascii NOT NULL AFTER gtid_purged - `, - ` - ALTER TABLE - database_instance - ADD COLUMN ancestry_uuid text CHARACTER SET ascii NOT NULL AFTER source_uuid - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_sql_thread_state tinyint signed not null default 0 AFTER replica_io_running - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_io_thread_state tinyint signed not null default 0 AFTER 
replication_sql_thread_state - `, - ` - ALTER TABLE - database_instance_tags /* sqlite3-skip */ - DROP PRIMARY KEY, - ADD PRIMARY KEY (hostname, port, tag_name) - `, - ` - ALTER TABLE - database_instance - ADD COLUMN region varchar(32) CHARACTER SET ascii NOT NULL AFTER data_center - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_primary_timeout INT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_primary_enabled - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_primary_wait_for_replica_count INT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_primary_timeout - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_primary_status TINYINT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_primary_wait_for_replica_count - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_replica_status TINYINT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_primary_status - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_primary_clients INT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_primary_status - `, - ` - ALTER TABLE /* sqlite3-skip */ - database_instance - MODIFY semi_sync_primary_timeout BIGINT UNSIGNED NOT NULL DEFAULT 0 - `, - // Fields related to Replication Group the instance belongs to - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_name VARCHAR(64) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER gtid_mode - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_is_single_primary_mode TINYINT UNSIGNED NOT NULL DEFAULT 1 AFTER replication_group_name - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_member_state VARCHAR(16) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER replication_group_is_single_primary_mode - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_member_role VARCHAR(16) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER replication_group_member_state - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_members 
text CHARACTER SET ascii NOT NULL AFTER replication_group_member_role - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_primary_host varchar(128) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER replication_group_members - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_primary_port smallint(5) unsigned NOT NULL DEFAULT 0 AFTER replication_group_primary_host - `, -} diff --git a/go/vt/vtorc/inst/analysis_dao.go b/go/vt/vtorc/inst/analysis_dao.go index 3a8d13991a0..21ffdbe4fe8 100644 --- a/go/vt/vtorc/inst/analysis_dao.go +++ b/go/vt/vtorc/inst/analysis_dao.go @@ -62,11 +62,11 @@ type clusterAnalysis struct { } // GetReplicationAnalysis will check for replication problems (dead primary; unreachable primary; etc) -func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) ([]ReplicationAnalysis, error) { +func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAnalysisHints) ([]ReplicationAnalysis, error) { result := []ReplicationAnalysis{} // TODO(sougou); deprecate ReduceReplicationAnalysisCount - args := sqlutils.Args(config.Config.ReasonableReplicationLagSeconds, ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, clusterName) + args := sqlutils.Args(config.Config.ReasonableReplicationLagSeconds, ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, keyspace, shard) query := ` SELECT vitess_tablet.info AS tablet_info, @@ -85,7 +85,6 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) MIN(primary_instance.physical_environment) AS physical_environment, MIN(primary_instance.source_host) AS source_host, MIN(primary_instance.source_port) AS source_port, - MIN(primary_instance.cluster_name) AS cluster_name, MIN(primary_instance.binary_log_file) AS binary_log_file, MIN(primary_instance.binary_log_pos) AS binary_log_pos, MIN(primary_tablet.info) AS primary_tablet_info, @@ -97,12 
+96,6 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) 0 ) ) AS is_stale_binlog_coordinates, - MIN( - IFNULL( - cluster_domain_name.domain_name, - primary_instance.cluster_name - ) - ) AS cluster_domain, MIN( primary_instance.last_checked <= primary_instance.last_seen and primary_instance.last_attempted_check <= primary_instance.last_seen + interval ? second @@ -332,12 +325,10 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) AND replica_instance.port = replica_downtime.port AND replica_downtime.downtime_active = 1 ) - LEFT JOIN cluster_domain_name ON ( - cluster_domain_name.cluster_name = primary_instance.cluster_name - ) WHERE database_instance_maintenance.database_instance_maintenance_id IS NULL - AND ? IN ('', primary_instance.cluster_name) + AND ? IN ('', vitess_keyspace.keyspace) + AND ? IN ('', vitess_tablet.shard) GROUP BY vitess_tablet.hostname, vitess_tablet.port @@ -392,8 +383,8 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) Type: BinaryLog, } isStaleBinlogCoordinates := m.GetBool("is_stale_binlog_coordinates") - a.ClusterDetails.ClusterName = m.GetString("cluster_name") - a.ClusterDetails.ClusterDomain = m.GetString("cluster_domain") + a.ClusterDetails.Keyspace = m.GetString("keyspace") + a.ClusterDetails.Shard = m.GetString("shard") a.GTIDMode = m.GetString("gtid_mode") a.LastCheckValid = m.GetBool("is_last_check_valid") a.LastCheckPartialSuccess = m.GetBool("last_check_partial_success") @@ -441,18 +432,19 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.IsReadOnly = m.GetUint("read_only") == 1 if !a.LastCheckValid { - analysisMessage := fmt.Sprintf("analysis: ClusterName: %+v, IsPrimary: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v, 
CountReplicasFailingToConnectToPrimary: %+v", - a.ClusterDetails.ClusterName, a.IsPrimary, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas, a.CountReplicasFailingToConnectToPrimary, + analysisMessage := fmt.Sprintf("analysis: Keyspace: %+v, Shard: %+v, IsPrimary: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v, CountReplicasFailingToConnectToPrimary: %+v", + a.ClusterDetails.Keyspace, a.ClusterDetails.Shard, a.IsPrimary, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas, a.CountReplicasFailingToConnectToPrimary, ) if util.ClearToLog("analysis_dao", analysisMessage) { log.Infof(analysisMessage) } } - if clusters[a.ClusterDetails.ClusterName] == nil { - clusters[a.ClusterDetails.ClusterName] = &clusterAnalysis{} + keyspaceShard := getKeyspaceShardName(a.ClusterDetails.Keyspace, a.ClusterDetails.Shard) + if clusters[keyspaceShard] == nil { + clusters[keyspaceShard] = &clusterAnalysis{} if a.TabletType == topodatapb.TabletType_PRIMARY { a.IsClusterPrimary = true - clusters[a.ClusterDetails.ClusterName].primaryKey = &a.AnalyzedInstanceKey + clusters[keyspaceShard].primaryKey = &a.AnalyzedInstanceKey } durabilityPolicy := m.GetString("durability_policy") if durabilityPolicy == "" { @@ -464,10 +456,10 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) log.Errorf("can't get the durability policy %v - %v. 
Skipping keyspace - %v.", durabilityPolicy, err, a.AnalyzedKeyspace) return nil } - clusters[a.ClusterDetails.ClusterName].durability = durability + clusters[keyspaceShard].durability = durability } // ca has clusterwide info - ca := clusters[a.ClusterDetails.ClusterName] + ca := clusters[keyspaceShard] if ca.hasClusterwideAction { // We can only take one cluster level action at a time. return nil diff --git a/go/vt/vtorc/inst/analysis_dao_test.go b/go/vt/vtorc/inst/analysis_dao_test.go index 1ee0d2a5ba5..ce293bd5051 100644 --- a/go/vt/vtorc/inst/analysis_dao_test.go +++ b/go/vt/vtorc/inst/analysis_dao_test.go @@ -577,7 +577,7 @@ func TestGetReplicationAnalysisDecision(t *testing.T) { } db.Db = test.NewTestDB([][]sqlutils.RowMap{rowMaps}) - got, err := GetReplicationAnalysis("", &ReplicationAnalysisHints{}) + got, err := GetReplicationAnalysis("", "", &ReplicationAnalysisHints{}) if tt.wantErr != "" { require.EqualError(t, err, tt.wantErr) return diff --git a/go/vt/vtorc/inst/audit_dao.go b/go/vt/vtorc/inst/audit_dao.go index fc0d4ca4bf8..7882449c655 100644 --- a/go/vt/vtorc/inst/audit_dao.go +++ b/go/vt/vtorc/inst/audit_dao.go @@ -54,9 +54,10 @@ func AuditOperation(auditType string, instanceKey *InstanceKey, message string) if instanceKey == nil { instanceKey = &InstanceKey{} } - clusterName := "" + keyspace := "" + shard := "" if instanceKey.Hostname != "" { - clusterName, _ = GetClusterName(instanceKey) + keyspace, shard, _ = GetKeyspaceShardName(instanceKey) } auditWrittenToFile := false @@ -70,7 +71,7 @@ func AuditOperation(auditType string, instanceKey *InstanceKey, message string) } defer f.Close() - text := fmt.Sprintf("%s\t%s\t%s\t%d\t[%s]\t%s\t\n", time.Now().Format("2006-01-02 15:04:05"), auditType, instanceKey.Hostname, instanceKey.Port, clusterName, message) + text := fmt.Sprintf("%s\t%s\t%s\t%d\t[%s:%s]\t%s\t\n", time.Now().Format("2006-01-02 15:04:05"), auditType, instanceKey.Hostname, instanceKey.Port, keyspace, shard, message) if _, err = 
f.WriteString(text); err != nil { log.Error(err) } @@ -80,15 +81,16 @@ func AuditOperation(auditType string, instanceKey *InstanceKey, message string) _, err := db.ExecVTOrc(` insert into audit ( - audit_timestamp, audit_type, hostname, port, cluster_name, message + audit_timestamp, audit_type, hostname, port, keyspace, shard, message ) VALUES ( - NOW(), ?, ?, ?, ?, ? + NOW(), ?, ?, ?, ?, ?, ? ) `, auditType, instanceKey.Hostname, instanceKey.Port, - clusterName, + keyspace, + shard, message, ) if err != nil { @@ -96,7 +98,7 @@ func AuditOperation(auditType string, instanceKey *InstanceKey, message string) return err } } - logMessage := fmt.Sprintf("auditType:%s instance:%s cluster:%s message:%s", auditType, instanceKey.DisplayString(), clusterName, message) + logMessage := fmt.Sprintf("auditType:%s instance:%s keyspace:%s shard:%s message:%s", auditType, instanceKey.DisplayString(), keyspace, shard, message) if syslogWriter != nil { auditWrittenToFile = true go func() { diff --git a/go/vt/vtorc/inst/audit_dao_test.go b/go/vt/vtorc/inst/audit_dao_test.go new file mode 100644 index 00000000000..4a6533077c2 --- /dev/null +++ b/go/vt/vtorc/inst/audit_dao_test.go @@ -0,0 +1,117 @@ +/* +Copyright 2022 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package inst + +import ( + "os" + "testing" + "time" + + "github.com/stretchr/testify/require" + + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/vtorc/config" + "vitess.io/vitess/go/vt/vtorc/db" +) + +// TestAuditOperation tests that auditing an operation works as intended based on the configurations. +func TestAuditOperation(t *testing.T) { + // Restore original configurations + originalAuditSysLog := config.Config.AuditToSyslog + originalAuditLogFile := config.Config.AuditLogFile + originalAuditBackend := config.Config.AuditToBackendDB + defer func() { + config.Config.AuditToSyslog = originalAuditSysLog + config.Config.AuditLogFile = originalAuditLogFile + config.Config.AuditToBackendDB = originalAuditBackend + }() + + orcDb, err := db.OpenVTOrc() + require.NoError(t, err) + defer func() { + _, err = orcDb.Exec("delete from audit") + require.NoError(t, err) + _, err = orcDb.Exec("delete from vitess_tablet") + require.NoError(t, err) + }() + + // Store a tablet in the database + ks := "ks" + shard := "0" + hostname := "localhost" + var port int32 = 100 + tab100 := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone-1", + Uid: 100, + }, + Hostname: hostname, + Keyspace: ks, + Shard: shard, + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: hostname, + MysqlPort: port, + } + err = SaveTablet(tab100) + require.NoError(t, err) + + instance100 := &InstanceKey{ + Hostname: hostname, + Port: int(port), + } + auditType := "test-audit-operation" + message := "test-message" + + t.Run("Audit to backend", func(t *testing.T) { + config.Config.AuditLogFile = "" + config.Config.AuditToSyslog = false + config.Config.AuditToBackendDB = true + + // Auditing should succeed as expected + err = AuditOperation(auditType, instance100, message) + require.NoError(t, err) + + audits, err := ReadRecentAudit(instance100, 0) + require.NoError(t, err) + require.Len(t, audits, 1) + require.EqualValues(t, 1, audits[0].AuditID) + 
require.EqualValues(t, auditType, audits[0].AuditType) + require.EqualValues(t, message, audits[0].Message) + require.EqualValues(t, *instance100, audits[0].AuditInstanceKey) + }) + + t.Run("Audit to File", func(t *testing.T) { + config.Config.AuditToBackendDB = false + config.Config.AuditToSyslog = false + + file, err := os.CreateTemp("", "test-auditing-*") + require.NoError(t, err) + defer os.Remove(file.Name()) + config.Config.AuditLogFile = file.Name() + + err = AuditOperation(auditType, instance100, message) + require.NoError(t, err) + + // Give a little time for the write to succeed since it happens in a separate go-routine + // There is no way to wait for that write to complete. This sleep is required to prevent this test from + // becoming flaky wherein we sometimes read the file before the contents are written. + time.Sleep(100 * time.Millisecond) + fileContent, err := os.ReadFile(file.Name()) + require.NoError(t, err) + require.Contains(t, string(fileContent), "\ttest-audit-operation\tlocalhost\t100\t[ks:0]\ttest-message") + }) +} diff --git a/go/vt/vtorc/inst/cluster.go b/go/vt/vtorc/inst/cluster.go index 805b25d7af4..c3a77485e74 100644 --- a/go/vt/vtorc/inst/cluster.go +++ b/go/vt/vtorc/inst/cluster.go @@ -18,8 +18,8 @@ package inst // ClusterInfo makes for a cluster status/info summary type ClusterInfo struct { - ClusterName string - ClusterDomain string // CNAME/VIP/A-record/whatever of the primary of this cluster + Keyspace string + Shard string CountInstances uint HeuristicLag int64 HasAutomatedPrimaryRecovery bool diff --git a/go/vt/vtorc/inst/cluster_domain_dao.go b/go/vt/vtorc/inst/cluster_domain_dao.go deleted file mode 100644 index 45aed648be1..00000000000 --- a/go/vt/vtorc/inst/cluster_domain_dao.go +++ /dev/null @@ -1,60 +0,0 @@ -/* - Copyright 2015 Shlomi Noach, courtesy Booking.com - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package inst - -import ( - "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/vtorc/config" - "vitess.io/vitess/go/vt/vtorc/db" -) - -// WriteClusterDomainName will write (and override) the domain name of a cluster -func WriteClusterDomainName(clusterName string, domainName string) error { - writeFunc := func() error { - _, err := db.ExecVTOrc(` - insert into - cluster_domain_name (cluster_name, domain_name, last_registered) - values - (?, ?, NOW()) - on duplicate key update - domain_name=values(domain_name), - last_registered=values(last_registered) - `, - clusterName, domainName) - if err != nil { - log.Error(err) - } - return err - } - return ExecDBWriteFunc(writeFunc) -} - -// ExpireClusterDomainName expires cluster_domain_name entries that haven't been updated recently. -func ExpireClusterDomainName() error { - writeFunc := func() error { - _, err := db.ExecVTOrc(` - delete from cluster_domain_name - where last_registered < NOW() - INTERVAL ? MINUTE - `, config.ExpiryHostnameResolvesMinutes, - ) - if err != nil { - log.Error(err) - } - return err - } - return ExecDBWriteFunc(writeFunc) -} diff --git a/go/vt/vtorc/inst/downtime_dao.go b/go/vt/vtorc/inst/downtime_dao.go index 1d5c33873b5..53b12e325e8 100644 --- a/go/vt/vtorc/inst/downtime_dao.go +++ b/go/vt/vtorc/inst/downtime_dao.go @@ -135,7 +135,7 @@ func renewLostInRecoveryDowntime() error { // expireLostInRecoveryDowntime expires downtime for servers who have been lost in recovery in the last, // but are now replicating. 
func expireLostInRecoveryDowntime() error { - instances, err := ReadLostInRecoveryInstances("") + instances, err := ReadLostInRecoveryInstances("", "") if err != nil { return err } diff --git a/go/vt/vtorc/inst/instance.go b/go/vt/vtorc/inst/instance.go index d4b8c7bfe88..dd1526ff090 100644 --- a/go/vt/vtorc/inst/instance.go +++ b/go/vt/vtorc/inst/instance.go @@ -74,7 +74,6 @@ type Instance struct { primaryExecutedGtidSet string // Not exported ReplicationLagSeconds sql.NullInt64 - ClusterName string DataCenter string Region string PhysicalEnvironment string diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index c51ca63e8d0..a799e4e3cb4 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -70,8 +70,6 @@ const ( GroupReplicationMemberStateError = "ERROR" ) -// instanceKeyInformativeClusterName is a non-authoritative cache; used for auditing or general purpose. -var instanceKeyInformativeClusterName *cache.Cache var forgetInstanceKeys *cache.Cache var accessDeniedCounter = metrics.NewCounter() @@ -96,7 +94,6 @@ func init() { func initializeInstanceDao() { config.WaitForConfigurationToBeLoaded() - instanceKeyInformativeClusterName = cache.New(time.Duration(config.Config.InstancePollSeconds/2)*time.Second, time.Second) forgetInstanceKeys = cache.New(time.Duration(config.Config.InstancePollSeconds*3)*time.Second, time.Second) } @@ -223,7 +220,6 @@ func ReadTopologyInstanceBufferable(instanceKey *InstanceKey, latency *stopwatch if err != nil { goto Cleanup } - instance.ClusterName = GetClusterNameFromKeyspaceAndShard(tablet.Keyspace, tablet.Shard) fullStatus, err = FullStatus(*instanceKey) if err != nil { @@ -459,8 +455,8 @@ Cleanup: return nil, err } -// GetClusterNameFromKeyspaceAndShard returns the cluster name from keyspace and shard -func GetClusterNameFromKeyspaceAndShard(keyspace, shard string) string { +// getKeyspaceShardName returns a single string having both the keyspace and shard +func 
getKeyspaceShardName(keyspace, shard string) string { return fmt.Sprintf("%v:%v", keyspace, shard) } @@ -512,7 +508,6 @@ func ReadInstanceClusterAttributes(instance *Instance) (err error) { var primaryOrGroupPrimaryExecutedGtidSet string primaryOrGroupPrimaryDataFound := false - // Read the cluster_name of the _primary_ or _group_primary_ of our instance, derive it from there. query := ` select replication_depth, @@ -609,7 +604,6 @@ func readInstanceRow(m sqlutils.RowMap) *Instance { instance.SecondsBehindPrimary = m.GetNullInt64("replication_lag_seconds") instance.ReplicationLagSeconds = m.GetNullInt64("replica_lag_seconds") instance.SQLDelay = m.GetUint("sql_delay") - instance.ClusterName = m.GetString("cluster_name") instance.DataCenter = m.GetString("data_center") instance.Region = m.GetString("region") instance.PhysicalEnvironment = m.GetString("physical_environment") @@ -779,9 +773,10 @@ func ReadReplicaInstancesIncludingBinlogServerSubReplicas(primaryKey *InstanceKe } // ReadProblemInstances reads all instances with problems -func ReadProblemInstances(clusterName string) ([](*Instance), error) { +func ReadProblemInstances(keyspace string, shard string) ([](*Instance), error) { condition := ` - cluster_name LIKE (CASE WHEN ? = '' THEN '%' ELSE ? END) + keyspace LIKE (CASE WHEN ? = '' THEN '%' ELSE ? END) + and shard LIKE (CASE WHEN ? = '' THEN '%' ELSE ? END) and ( (last_seen < last_checked) or (unix_timestamp() - unix_timestamp(last_checked) > ?) 
@@ -794,7 +789,7 @@ func ReadProblemInstances(clusterName string) ([](*Instance), error) { ) ` - args := sqlutils.Args(clusterName, clusterName, config.Config.InstancePollSeconds*5, config.Config.ReasonableReplicationLagSeconds, config.Config.ReasonableReplicationLagSeconds) + args := sqlutils.Args(keyspace, keyspace, shard, shard, config.Config.InstancePollSeconds*5, config.Config.ReasonableReplicationLagSeconds, config.Config.ReasonableReplicationLagSeconds) instances, err := readInstancesByCondition(condition, args, "") if err != nil { return instances, err @@ -814,15 +809,16 @@ func ReadProblemInstances(clusterName string) ([](*Instance), error) { // ReadLostInRecoveryInstances returns all instances (potentially filtered by cluster) // which are currently indicated as downtimed due to being lost during a topology recovery. -func ReadLostInRecoveryInstances(clusterName string) ([](*Instance), error) { +func ReadLostInRecoveryInstances(keyspace string, shard string) ([](*Instance), error) { condition := ` ifnull( database_instance_downtime.downtime_active = 1 and database_instance_downtime.end_timestamp > now() and database_instance_downtime.reason = ?, 0) - and ? IN ('', cluster_name) + and ? IN ('', keyspace) + and ? IN ('', shard) ` - return readInstancesByCondition(condition, sqlutils.Args(DowntimeLostInRecoveryMessage, clusterName), "cluster_name asc, replication_depth asc") + return readInstancesByCondition(condition, sqlutils.Args(DowntimeLostInRecoveryMessage, keyspace, shard), "keyspace asc, shard asc, replication_depth asc") } // readUnseenPrimaryKeys will read list of primaries that have never been seen, and yet whose replicas @@ -863,46 +859,6 @@ func readUnseenPrimaryKeys() ([]InstanceKey, error) { return res, nil } -// InjectSeed: intented to be used to inject an instance upon startup, assuming it's not already known to vtorc. 
-func InjectSeed(instanceKey *InstanceKey) error { - if instanceKey == nil { - return fmt.Errorf("InjectSeed: nil instanceKey") - } - clusterName := instanceKey.StringCode() - // minimal details: - instance := &Instance{Key: *instanceKey, Version: "Unknown", ClusterName: clusterName} - instance.SetSeed() - err := WriteInstance(instance, false, nil) - log.Infof("InjectSeed: %+v, %+v", *instanceKey, err) - _ = AuditOperation("inject-seed", instanceKey, "injected") - return err -} - -// InjectUnseenPrimaries will review primaries of instances that are known to be replicating, yet which are not listed -// in database_instance. Since their replicas are listed as replicating, we can assume that such primaries actually do -// exist: we shall therefore inject them with minimal details into the database_instance table. -func InjectUnseenPrimaries() error { - - unseenPrimaryKeys, err := readUnseenPrimaryKeys() - if err != nil { - return err - } - - operations := 0 - for _, primaryKey := range unseenPrimaryKeys { - primaryKey := primaryKey - clusterName := primaryKey.StringCode() - // minimal details: - instance := Instance{Key: primaryKey, Version: "Unknown", ClusterName: clusterName} - if err := WriteInstance(&instance, false, nil); err == nil { - operations++ - } - } - - _ = AuditOperation("inject-unseen-primaries", nil, fmt.Sprintf("Operations: %d", operations)) - return err -} - // ForgetUnseenInstancesDifferentlyResolved will purge instances which are invalid, and whose hostname // appears on the hostname_resolved table; this means some time in the past their hostname was unresovled, and now // resovled to a different value; the old hostname is never accessed anymore and the old entry should be removed. 
@@ -999,28 +955,27 @@ func ResolveUnknownPrimaryHostnameResolves() error { return err } -func GetClusterName(instanceKey *InstanceKey) (clusterName string, err error) { - if clusterName, found := instanceKeyInformativeClusterName.Get(instanceKey.StringCode()); found { - return clusterName.(string), nil - } +// GetKeyspaceShardName gets the keyspace shard name for the given instance key +func GetKeyspaceShardName(instanceKey *InstanceKey) (keyspace string, shard string, err error) { query := ` select - ifnull(max(cluster_name), '') as cluster_name + keyspace, + shard from - database_instance + vitess_tablet where hostname = ? and port = ? ` err = db.QueryVTOrc(query, sqlutils.Args(instanceKey.Hostname, instanceKey.Port), func(m sqlutils.RowMap) error { - clusterName = m.GetString("cluster_name") - instanceKeyInformativeClusterName.Set(instanceKey.StringCode(), clusterName, cache.DefaultExpiration) + keyspace = m.GetString("keyspace") + shard = m.GetString("shard") return nil }) if err != nil { log.Error(err) } - return clusterName, err + return keyspace, shard, err } // ReadOutdatedInstanceKeys reads and returns keys for all instances that are not up to date (i.e. 
@@ -1177,7 +1132,6 @@ func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bo "replication_lag_seconds", "replica_lag_seconds", "sql_delay", - "cluster_name", "data_center", "region", "physical_environment", @@ -1264,7 +1218,6 @@ func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bo args = append(args, instance.SecondsBehindPrimary) args = append(args, instance.ReplicationLagSeconds) args = append(args, instance.SQLDelay) - args = append(args, instance.ClusterName) args = append(args, instance.DataCenter) args = append(args, instance.Region) args = append(args, instance.PhysicalEnvironment) @@ -1454,10 +1407,10 @@ func SnapshotTopologies() error { _, err := db.ExecVTOrc(` insert ignore into database_instance_topology_history (snapshot_unix_timestamp, - hostname, port, source_host, source_port, cluster_name, version) + hostname, port, source_host, source_port, version) select UNIX_TIMESTAMP(NOW()), - hostname, port, source_host, source_port, cluster_name, version + hostname, port, source_host, source_port, version from database_instance `, diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go index 5d2ba5a73ce..71d0ed94ff9 100644 --- a/go/vt/vtorc/inst/instance_dao_test.go +++ b/go/vt/vtorc/inst/instance_dao_test.go @@ -8,6 +8,9 @@ import ( "testing" "github.com/stretchr/testify/require" + + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/vtorc/db" ) var ( @@ -60,17 +63,17 @@ func TestMkInsertOdkuSingle(t *testing.T) { version, major_version, version_comment, binlog_server, read_only, binlog_format, binlog_row_image, log_bin, log_replica_updates, binary_log_file, binary_log_pos, source_host, source_port, replica_sql_running, replica_io_running, replication_sql_thread_state, replication_io_thread_state, has_replication_filters, supports_oracle_gtid, oracle_gtid, source_uuid, ancestry_uuid, executed_gtid_set, gtid_mode, gtid_purged, gtid_errant, 
mariadb_gtid, pseudo_gtid, - source_log_file, read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, cluster_name, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, semi_sync_replica_enabled, semi_sync_primary_status, semi_sync_primary_clients, semi_sync_replica_status, instance_alias, last_discovery_latency, replication_group_name, replication_group_is_single_primary_mode, replication_group_member_state, replication_group_member_role, replication_group_members, replication_group_primary_host, replication_group_primary_port, last_seen) + source_log_file, read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, semi_sync_replica_enabled, semi_sync_primary_status, semi_sync_primary_clients, semi_sync_replica_status, instance_alias, last_discovery_latency, replication_group_name, replication_group_is_single_primary_mode, replication_group_member_state, replication_group_member_role, replication_group_members, replication_group_primary_host, replication_group_primary_port, last_seen) VALUES - (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()) + (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()) ON DUPLICATE KEY UPDATE - hostname=VALUES(hostname), port=VALUES(port), last_checked=VALUES(last_checked), last_attempted_check=VALUES(last_attempted_check), last_check_partial_success=VALUES(last_check_partial_success), server_id=VALUES(server_id), server_uuid=VALUES(server_uuid), version=VALUES(version), major_version=VALUES(major_version), version_comment=VALUES(version_comment), binlog_server=VALUES(binlog_server), read_only=VALUES(read_only), binlog_format=VALUES(binlog_format), binlog_row_image=VALUES(binlog_row_image), log_bin=VALUES(log_bin), log_replica_updates=VALUES(log_replica_updates), binary_log_file=VALUES(binary_log_file), binary_log_pos=VALUES(binary_log_pos), source_host=VALUES(source_host), source_port=VALUES(source_port), replica_sql_running=VALUES(replica_sql_running), replica_io_running=VALUES(replica_io_running), replication_sql_thread_state=VALUES(replication_sql_thread_state), replication_io_thread_state=VALUES(replication_io_thread_state), has_replication_filters=VALUES(has_replication_filters), supports_oracle_gtid=VALUES(supports_oracle_gtid), oracle_gtid=VALUES(oracle_gtid), source_uuid=VALUES(source_uuid), ancestry_uuid=VALUES(ancestry_uuid), executed_gtid_set=VALUES(executed_gtid_set), gtid_mode=VALUES(gtid_mode), gtid_purged=VALUES(gtid_purged), gtid_errant=VALUES(gtid_errant), mariadb_gtid=VALUES(mariadb_gtid), pseudo_gtid=VALUES(pseudo_gtid), source_log_file=VALUES(source_log_file), read_source_log_pos=VALUES(read_source_log_pos), relay_source_log_file=VALUES(relay_source_log_file), exec_source_log_pos=VALUES(exec_source_log_pos), relay_log_file=VALUES(relay_log_file), relay_log_pos=VALUES(relay_log_pos), last_sql_error=VALUES(last_sql_error), last_io_error=VALUES(last_io_error), replication_lag_seconds=VALUES(replication_lag_seconds), replica_lag_seconds=VALUES(replica_lag_seconds), 
sql_delay=VALUES(sql_delay), cluster_name=VALUES(cluster_name), data_center=VALUES(data_center), region=VALUES(region), physical_environment=VALUES(physical_environment), replication_depth=VALUES(replication_depth), is_co_primary=VALUES(is_co_primary), has_replication_credentials=VALUES(has_replication_credentials), allow_tls=VALUES(allow_tls), + hostname=VALUES(hostname), port=VALUES(port), last_checked=VALUES(last_checked), last_attempted_check=VALUES(last_attempted_check), last_check_partial_success=VALUES(last_check_partial_success), server_id=VALUES(server_id), server_uuid=VALUES(server_uuid), version=VALUES(version), major_version=VALUES(major_version), version_comment=VALUES(version_comment), binlog_server=VALUES(binlog_server), read_only=VALUES(read_only), binlog_format=VALUES(binlog_format), binlog_row_image=VALUES(binlog_row_image), log_bin=VALUES(log_bin), log_replica_updates=VALUES(log_replica_updates), binary_log_file=VALUES(binary_log_file), binary_log_pos=VALUES(binary_log_pos), source_host=VALUES(source_host), source_port=VALUES(source_port), replica_sql_running=VALUES(replica_sql_running), replica_io_running=VALUES(replica_io_running), replication_sql_thread_state=VALUES(replication_sql_thread_state), replication_io_thread_state=VALUES(replication_io_thread_state), has_replication_filters=VALUES(has_replication_filters), supports_oracle_gtid=VALUES(supports_oracle_gtid), oracle_gtid=VALUES(oracle_gtid), source_uuid=VALUES(source_uuid), ancestry_uuid=VALUES(ancestry_uuid), executed_gtid_set=VALUES(executed_gtid_set), gtid_mode=VALUES(gtid_mode), gtid_purged=VALUES(gtid_purged), gtid_errant=VALUES(gtid_errant), mariadb_gtid=VALUES(mariadb_gtid), pseudo_gtid=VALUES(pseudo_gtid), source_log_file=VALUES(source_log_file), read_source_log_pos=VALUES(read_source_log_pos), relay_source_log_file=VALUES(relay_source_log_file), exec_source_log_pos=VALUES(exec_source_log_pos), relay_log_file=VALUES(relay_log_file), relay_log_pos=VALUES(relay_log_pos), 
last_sql_error=VALUES(last_sql_error), last_io_error=VALUES(last_io_error), replication_lag_seconds=VALUES(replication_lag_seconds), replica_lag_seconds=VALUES(replica_lag_seconds), sql_delay=VALUES(sql_delay), data_center=VALUES(data_center), region=VALUES(region), physical_environment=VALUES(physical_environment), replication_depth=VALUES(replication_depth), is_co_primary=VALUES(is_co_primary), has_replication_credentials=VALUES(has_replication_credentials), allow_tls=VALUES(allow_tls), semi_sync_enforced=VALUES(semi_sync_enforced), semi_sync_primary_enabled=VALUES(semi_sync_primary_enabled), semi_sync_primary_timeout=VALUES(semi_sync_primary_timeout), semi_sync_primary_wait_for_replica_count=VALUES(semi_sync_primary_wait_for_replica_count), semi_sync_replica_enabled=VALUES(semi_sync_replica_enabled), semi_sync_primary_status=VALUES(semi_sync_primary_status), semi_sync_primary_clients=VALUES(semi_sync_primary_clients), semi_sync_replica_status=VALUES(semi_sync_replica_status), instance_alias=VALUES(instance_alias), last_discovery_latency=VALUES(last_discovery_latency), replication_group_name=VALUES(replication_group_name), replication_group_is_single_primary_mode=VALUES(replication_group_is_single_primary_mode), replication_group_member_state=VALUES(replication_group_member_state), replication_group_member_role=VALUES(replication_group_member_role), replication_group_members=VALUES(replication_group_members), replication_group_primary_host=VALUES(replication_group_primary_host), replication_group_primary_port=VALUES(replication_group_primary_port), last_seen=VALUES(last_seen) ` a1 := `i710, 3306, 710, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, - false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 10, , 0, , , {0 false}, {0 false}, 0, , , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, ` + false, false, 0, 0, false, false, false, , , , , , , false, 
false, , 0, mysql.000007, 10, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, ` sql1, args1, err := mkInsertOdkuForInstances(instances[:1], false, true) require.NoError(t, err) @@ -83,22 +86,22 @@ func TestMkInsertOdkuThree(t *testing.T) { // three instances s3 := `INSERT INTO database_instance - (hostname, port, last_checked, last_attempted_check, last_check_partial_success, server_id, server_uuid, version, major_version, version_comment, binlog_server, read_only, binlog_format, binlog_row_image, log_bin, log_replica_updates, binary_log_file, binary_log_pos, source_host, source_port, replica_sql_running, replica_io_running, replication_sql_thread_state, replication_io_thread_state, has_replication_filters, supports_oracle_gtid, oracle_gtid, source_uuid, ancestry_uuid, executed_gtid_set, gtid_mode, gtid_purged, gtid_errant, mariadb_gtid, pseudo_gtid, source_log_file, read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, cluster_name, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, + (hostname, port, last_checked, last_attempted_check, last_check_partial_success, server_id, server_uuid, version, major_version, version_comment, binlog_server, read_only, binlog_format, binlog_row_image, log_bin, log_replica_updates, binary_log_file, binary_log_pos, source_host, source_port, replica_sql_running, replica_io_running, replication_sql_thread_state, replication_io_thread_state, has_replication_filters, supports_oracle_gtid, oracle_gtid, source_uuid, ancestry_uuid, executed_gtid_set, gtid_mode, gtid_purged, gtid_errant, mariadb_gtid, pseudo_gtid, source_log_file, 
read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, semi_sync_replica_enabled, semi_sync_primary_status, semi_sync_primary_clients, semi_sync_replica_status, instance_alias, last_discovery_latency, replication_group_name, replication_group_is_single_primary_mode, replication_group_member_state, replication_group_member_role, replication_group_members, replication_group_primary_host, replication_group_primary_port, last_seen) VALUES - (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()), - (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()), - (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()) + (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()), + (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()), + (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, 
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()) ON DUPLICATE KEY UPDATE - hostname=VALUES(hostname), port=VALUES(port), last_checked=VALUES(last_checked), last_attempted_check=VALUES(last_attempted_check), last_check_partial_success=VALUES(last_check_partial_success), server_id=VALUES(server_id), server_uuid=VALUES(server_uuid), version=VALUES(version), major_version=VALUES(major_version), version_comment=VALUES(version_comment), binlog_server=VALUES(binlog_server), read_only=VALUES(read_only), binlog_format=VALUES(binlog_format), binlog_row_image=VALUES(binlog_row_image), log_bin=VALUES(log_bin), log_replica_updates=VALUES(log_replica_updates), binary_log_file=VALUES(binary_log_file), binary_log_pos=VALUES(binary_log_pos), source_host=VALUES(source_host), source_port=VALUES(source_port), replica_sql_running=VALUES(replica_sql_running), replica_io_running=VALUES(replica_io_running), replication_sql_thread_state=VALUES(replication_sql_thread_state), replication_io_thread_state=VALUES(replication_io_thread_state), has_replication_filters=VALUES(has_replication_filters), supports_oracle_gtid=VALUES(supports_oracle_gtid), oracle_gtid=VALUES(oracle_gtid), source_uuid=VALUES(source_uuid), ancestry_uuid=VALUES(ancestry_uuid), executed_gtid_set=VALUES(executed_gtid_set), gtid_mode=VALUES(gtid_mode), gtid_purged=VALUES(gtid_purged), gtid_errant=VALUES(gtid_errant), mariadb_gtid=VALUES(mariadb_gtid), pseudo_gtid=VALUES(pseudo_gtid), source_log_file=VALUES(source_log_file), read_source_log_pos=VALUES(read_source_log_pos), relay_source_log_file=VALUES(relay_source_log_file), exec_source_log_pos=VALUES(exec_source_log_pos), relay_log_file=VALUES(relay_log_file), relay_log_pos=VALUES(relay_log_pos), last_sql_error=VALUES(last_sql_error), last_io_error=VALUES(last_io_error), replication_lag_seconds=VALUES(replication_lag_seconds), 
replica_lag_seconds=VALUES(replica_lag_seconds), sql_delay=VALUES(sql_delay), cluster_name=VALUES(cluster_name), data_center=VALUES(data_center), region=VALUES(region), + hostname=VALUES(hostname), port=VALUES(port), last_checked=VALUES(last_checked), last_attempted_check=VALUES(last_attempted_check), last_check_partial_success=VALUES(last_check_partial_success), server_id=VALUES(server_id), server_uuid=VALUES(server_uuid), version=VALUES(version), major_version=VALUES(major_version), version_comment=VALUES(version_comment), binlog_server=VALUES(binlog_server), read_only=VALUES(read_only), binlog_format=VALUES(binlog_format), binlog_row_image=VALUES(binlog_row_image), log_bin=VALUES(log_bin), log_replica_updates=VALUES(log_replica_updates), binary_log_file=VALUES(binary_log_file), binary_log_pos=VALUES(binary_log_pos), source_host=VALUES(source_host), source_port=VALUES(source_port), replica_sql_running=VALUES(replica_sql_running), replica_io_running=VALUES(replica_io_running), replication_sql_thread_state=VALUES(replication_sql_thread_state), replication_io_thread_state=VALUES(replication_io_thread_state), has_replication_filters=VALUES(has_replication_filters), supports_oracle_gtid=VALUES(supports_oracle_gtid), oracle_gtid=VALUES(oracle_gtid), source_uuid=VALUES(source_uuid), ancestry_uuid=VALUES(ancestry_uuid), executed_gtid_set=VALUES(executed_gtid_set), gtid_mode=VALUES(gtid_mode), gtid_purged=VALUES(gtid_purged), gtid_errant=VALUES(gtid_errant), mariadb_gtid=VALUES(mariadb_gtid), pseudo_gtid=VALUES(pseudo_gtid), source_log_file=VALUES(source_log_file), read_source_log_pos=VALUES(read_source_log_pos), relay_source_log_file=VALUES(relay_source_log_file), exec_source_log_pos=VALUES(exec_source_log_pos), relay_log_file=VALUES(relay_log_file), relay_log_pos=VALUES(relay_log_pos), last_sql_error=VALUES(last_sql_error), last_io_error=VALUES(last_io_error), replication_lag_seconds=VALUES(replication_lag_seconds), replica_lag_seconds=VALUES(replica_lag_seconds), 
sql_delay=VALUES(sql_delay), data_center=VALUES(data_center), region=VALUES(region), physical_environment=VALUES(physical_environment), replication_depth=VALUES(replication_depth), is_co_primary=VALUES(is_co_primary), has_replication_credentials=VALUES(has_replication_credentials), allow_tls=VALUES(allow_tls), semi_sync_enforced=VALUES(semi_sync_enforced), semi_sync_primary_enabled=VALUES(semi_sync_primary_enabled), semi_sync_primary_timeout=VALUES(semi_sync_primary_timeout), semi_sync_primary_wait_for_replica_count=VALUES(semi_sync_primary_wait_for_replica_count), semi_sync_replica_enabled=VALUES(semi_sync_replica_enabled), semi_sync_primary_status=VALUES(semi_sync_primary_status), semi_sync_primary_clients=VALUES(semi_sync_primary_clients), semi_sync_replica_status=VALUES(semi_sync_replica_status), instance_alias=VALUES(instance_alias), last_discovery_latency=VALUES(last_discovery_latency), replication_group_name=VALUES(replication_group_name), replication_group_is_single_primary_mode=VALUES(replication_group_is_single_primary_mode), replication_group_member_state=VALUES(replication_group_member_state), replication_group_member_role=VALUES(replication_group_member_role), replication_group_members=VALUES(replication_group_members), replication_group_primary_host=VALUES(replication_group_primary_host), replication_group_primary_port=VALUES(replication_group_primary_port), last_seen=VALUES(last_seen) ` a3 := ` - i710, 3306, 710, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 10, , 0, , , {0 false}, {0 false}, 0, , , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, - i720, 3306, 720, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 20, , 0, , , {0 false}, {0 false}, 0, , , , , 0, false, false, 
false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, - i730, 3306, 730, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 30, , 0, , , {0 false}, {0 false}, 0, , , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, + i710, 3306, 710, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 10, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, + i720, 3306, 720, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 20, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, + i730, 3306, 730, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 30, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, ` sql3, args3, err := mkInsertOdkuForInstances(instances[:3], true, true) @@ -115,3 +118,40 @@ func fmtArgs(args []any) string { } return b.String() } + +func TestGetKeyspaceShardName(t *testing.T) { + orcDb, err := db.OpenVTOrc() + require.NoError(t, err) + defer func() { + _, err = orcDb.Exec("delete from vitess_tablet") + require.NoError(t, err) + }() + + ks := "ks" + shard := "0" + hostname := "localhost" + var port int32 = 100 + tab100 := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone-1", + Uid: 100, + }, + Hostname: hostname, + Keyspace: ks, + Shard: shard, + Type: topodatapb.TabletType_PRIMARY, + 
MysqlHostname: hostname, + MysqlPort: port, + } + + err = SaveTablet(tab100) + require.NoError(t, err) + + keyspaceRead, shardRead, err := GetKeyspaceShardName(&InstanceKey{ + Hostname: hostname, + Port: int(port), + }) + require.NoError(t, err) + require.Equal(t, ks, keyspaceRead) + require.Equal(t, shard, shardRead) +} diff --git a/go/vt/vtorc/inst/minimal_instance.go b/go/vt/vtorc/inst/minimal_instance.go deleted file mode 100644 index 1eeb85663d3..00000000000 --- a/go/vt/vtorc/inst/minimal_instance.go +++ /dev/null @@ -1,15 +0,0 @@ -package inst - -type MinimalInstance struct { - Key InstanceKey - PrimaryKey InstanceKey - ClusterName string -} - -func (minimalInstance *MinimalInstance) ToInstance() *Instance { - return &Instance{ - Key: minimalInstance.Key, - SourceKey: minimalInstance.PrimaryKey, - ClusterName: minimalInstance.ClusterName, - } -} diff --git a/go/vt/vtorc/logic/orchestrator.go b/go/vt/vtorc/logic/orchestrator.go index 77e0fd30993..dcc30027392 100644 --- a/go/vt/vtorc/logic/orchestrator.go +++ b/go/vt/vtorc/logic/orchestrator.go @@ -385,7 +385,6 @@ func ContinuousDiscovery() { // Various periodic internal maintenance tasks go func() { if IsLeaderOrActive() { - go inst.InjectUnseenPrimaries() go inst.ForgetLongUnseenInstances() go inst.ForgetUnseenInstancesDifferentlyResolved() @@ -395,7 +394,6 @@ func ContinuousDiscovery() { go inst.ExpireMaintenance() go inst.ExpireCandidateInstances() go inst.ExpireHostnameUnresolve() - go inst.ExpireClusterDomainName() go inst.ExpireAudit() go inst.FlushNontrivialResolveCacheToDatabase() go inst.ExpireStaleInstanceBinlogCoordinates() diff --git a/go/vt/vtorc/logic/tablet_discovery.go b/go/vt/vtorc/logic/tablet_discovery.go index e6cf71f101c..51fe5e7c2b9 100644 --- a/go/vt/vtorc/logic/tablet_discovery.go +++ b/go/vt/vtorc/logic/tablet_discovery.go @@ -72,9 +72,6 @@ func OpenTabletDiscovery() <-chan time.Time { if _, err := db.ExecVTOrc("delete from vitess_tablet"); err != nil { log.Error(err) } - 
refreshTabletsUsing(func(instanceKey *inst.InstanceKey) { - _ = inst.InjectSeed(instanceKey) - }, false /* forceRefresh */) return time.Tick(time.Second * time.Duration(config.Config.TopoInformationRefreshSeconds)) //nolint SA1015: using time.Tick leaks the underlying ticker } diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index db77cac42ca..8a56cc9235b 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -34,7 +34,6 @@ import ( "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/vtctl/reparentutil" - "vitess.io/vitess/go/vt/vtorc/attributes" "vitess.io/vitess/go/vt/vtorc/config" "vitess.io/vitess/go/vt/vtorc/inst" "vitess.io/vitess/go/vt/vtorc/util" @@ -100,7 +99,6 @@ type RecoveryAcknowledgement struct { Comment string Key inst.InstanceKey - ClusterName string ID int64 UID string AllRecoveries bool @@ -109,7 +107,6 @@ type RecoveryAcknowledgement struct { // BlockedTopologyRecovery represents an entry in the blocked_topology_recovery table type BlockedTopologyRecovery struct { FailedInstanceKey inst.InstanceKey - ClusterName string Analysis inst.AnalysisCode LastBlockedTimestamp string BlockingRecoveryID int64 @@ -330,8 +327,6 @@ func postErsCompletion(topologyRecovery *TopologyRecovery, analysisEntry inst.Re if promotedReplica != nil { // Success! 
_ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadPrimary: successfully promoted %+v", promotedReplica.Key)) - - _ = attributes.SetGeneralAttribute(analysisEntry.ClusterDetails.ClusterDomain, promotedReplica.Key.StringCode()) } } @@ -753,7 +748,7 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand // checkIfAlreadyFixed checks whether the problem that the analysis entry represents has already been fixed by another agent or not func checkIfAlreadyFixed(analysisEntry inst.ReplicationAnalysis) (bool, error) { // Run a replication analysis again. We will check if the problem persisted - analysisEntries, err := inst.GetReplicationAnalysis(analysisEntry.ClusterDetails.ClusterName, &inst.ReplicationAnalysisHints{}) + analysisEntries, err := inst.GetReplicationAnalysis(analysisEntry.ClusterDetails.Keyspace, analysisEntry.ClusterDetails.Shard, &inst.ReplicationAnalysisHints{}) if err != nil { return false, err } @@ -772,7 +767,7 @@ func checkIfAlreadyFixed(analysisEntry inst.ReplicationAnalysis) (bool, error) { // CheckAndRecover is the main entry point for the recovery mechanism func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, promotedReplicaKey *inst.InstanceKey, err error) { // Allow the analysis to run even if we don't want to recover - replicationAnalysis, err := inst.GetReplicationAnalysis("", &inst.ReplicationAnalysisHints{IncludeDowntimed: true, AuditAnalysis: true}) + replicationAnalysis, err := inst.GetReplicationAnalysis("", "", &inst.ReplicationAnalysisHints{IncludeDowntimed: true, AuditAnalysis: true}) if err != nil { log.Error(err) return false, nil, err @@ -823,7 +818,6 @@ func postPrsCompletion(topologyRecovery *TopologyRecovery, analysisEntry inst.Re if promotedReplica != nil { // Success! 
_ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%+v: successfully promoted %+v", analysisEntry.Analysis, promotedReplica.Key)) - _ = attributes.SetGeneralAttribute(analysisEntry.ClusterDetails.ClusterDomain, promotedReplica.Key.StringCode()) } } @@ -834,7 +828,7 @@ func electNewPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another electNewPrimary.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - log.Infof("Analysis: %v, will elect a new primary: %v", analysisEntry.Analysis, analysisEntry.ClusterDetails.ClusterName) + log.Infof("Analysis: %v, will elect a new primary for %v:%v", analysisEntry.Analysis, analysisEntry.ClusterDetails.Keyspace, analysisEntry.ClusterDetails.Shard) var promotedReplica *inst.Instance // This has to be done in the end; whether successful or not, we should mark that the recovery is done. diff --git a/go/vt/vtorc/logic/topology_recovery_dao.go b/go/vt/vtorc/logic/topology_recovery_dao.go index 0a24d507793..f2254fe542e 100644 --- a/go/vt/vtorc/logic/topology_recovery_dao.go +++ b/go/vt/vtorc/logic/topology_recovery_dao.go @@ -37,7 +37,8 @@ func AttemptFailureDetectionRegistration(analysisEntry *inst.ReplicationAnalysis process.ThisHostname, util.ProcessToken.Hash, string(analysisEntry.Analysis), - analysisEntry.ClusterDetails.ClusterName, + analysisEntry.ClusterDetails.Keyspace, + analysisEntry.ClusterDetails.Shard, analysisEntry.CountReplicas, analysisEntry.IsActionableRecovery, ) @@ -57,7 +58,8 @@ func AttemptFailureDetectionRegistration(analysisEntry *inst.ReplicationAnalysis processing_node_hostname, processcing_node_token, analysis, - cluster_name, + keyspace, + shard, count_affected_replicas, is_actionable, start_active_period @@ -72,6 +74,7 @@ func AttemptFailureDetectionRegistration(analysisEntry *inst.ReplicationAnalysis ?, ?, ?, + ?, %s ) `, startActivePeriodHint) @@ -123,7 
+126,8 @@ func writeTopologyRecovery(topologyRecovery *TopologyRecovery) (*TopologyRecover processing_node_hostname, processcing_node_token, analysis, - cluster_name, + keyspace, + shard, count_affected_replicas, last_detection_id ) values ( @@ -139,6 +143,7 @@ func writeTopologyRecovery(topologyRecovery *TopologyRecovery) (*TopologyRecover ?, ?, ?, + ?, (select ifnull(max(detection_id), 0) from topology_failure_detection where hostname=? and port=?) ) `, @@ -147,7 +152,8 @@ func writeTopologyRecovery(topologyRecovery *TopologyRecovery) (*TopologyRecover analysisEntry.AnalyzedInstanceKey.Hostname, analysisEntry.AnalyzedInstanceKey.Port, process.ThisHostname, util.ProcessToken.Hash, string(analysisEntry.Analysis), - analysisEntry.ClusterDetails.ClusterName, + analysisEntry.ClusterDetails.Keyspace, + analysisEntry.ClusterDetails.Shard, analysisEntry.CountReplicas, analysisEntry.AnalyzedInstanceKey.Hostname, analysisEntry.AnalyzedInstanceKey.Port, ) @@ -189,14 +195,14 @@ func AttemptRecoveryRegistration(analysisEntry *inst.ReplicationAnalysis, failIf if failIfClusterInActiveRecovery { // Let's check if this cluster has just experienced a failover and is still in active period. // If so, we reject recovery registration to avoid flapping. - recoveries, err := ReadInActivePeriodClusterRecovery(analysisEntry.ClusterDetails.ClusterName) + recoveries, err := ReadInActivePeriodClusterRecovery(analysisEntry.ClusterDetails.Keyspace, analysisEntry.ClusterDetails.Shard) if err != nil { log.Error(err) return nil, err } if len(recoveries) > 0 { _ = RegisterBlockedRecoveries(analysisEntry, recoveries) - errMsg := fmt.Sprintf("AttemptRecoveryRegistration: cluster %+v has recently experienced a failover (of %+v) and is in active period. It will not be failed over again. 
You may acknowledge the failure on this cluster (-c ack-cluster-recoveries) or on %+v (-c ack-instance-recoveries) to remove this blockage", analysisEntry.ClusterDetails.ClusterName, recoveries[0].AnalysisEntry.AnalyzedInstanceKey, recoveries[0].AnalysisEntry.AnalyzedInstanceKey) + errMsg := fmt.Sprintf("AttemptRecoveryRegistration: keyspace %+v shard %+v has recently experienced a failover (of %+v) and is in active period. It will not be failed over again. You may acknowledge the failure on this cluster (-c ack-cluster-recoveries) or on %+v (-c ack-instance-recoveries) to remove this blockage", analysisEntry.ClusterDetails.Keyspace, analysisEntry.ClusterDetails.Shard, recoveries[0].AnalysisEntry.AnalyzedInstanceKey, recoveries[0].AnalysisEntry.AnalyzedInstanceKey) log.Errorf(errMsg) return nil, fmt.Errorf(errMsg) } @@ -246,7 +252,8 @@ func RegisterBlockedRecoveries(analysisEntry *inst.ReplicationAnalysis, blocking into blocked_topology_recovery ( hostname, port, - cluster_name, + keyspace, + shard, analysis, last_blocked_timestamp, blocking_recovery_id @@ -259,13 +266,15 @@ func RegisterBlockedRecoveries(analysisEntry *inst.ReplicationAnalysis, blocking ? 
) on duplicate key update - cluster_name=values(cluster_name), + keyspace=values(keyspace), + shard=values(shard), analysis=values(analysis), last_blocked_timestamp=values(last_blocked_timestamp), blocking_recovery_id=values(blocking_recovery_id) `, analysisEntry.AnalyzedInstanceKey.Hostname, analysisEntry.AnalyzedInstanceKey.Port, - analysisEntry.ClusterDetails.ClusterName, + analysisEntry.ClusterDetails.Keyspace, + analysisEntry.ClusterDetails.Shard, string(analysisEntry.Analysis), recovery.ID, ) @@ -444,7 +453,8 @@ func readRecoveries(whereCondition string, limit string, args []any) ([]*Topolog ifnull(successor_port, 0) as successor_port, ifnull(successor_alias, '') as successor_alias, analysis, - cluster_name, + keyspace, + shard, count_affected_replicas, participating_instances, lost_replicas, @@ -476,7 +486,8 @@ func readRecoveries(whereCondition string, limit string, args []any) ([]*Topolog topologyRecovery.AnalysisEntry.AnalyzedInstanceKey.Hostname = m.GetString("hostname") topologyRecovery.AnalysisEntry.AnalyzedInstanceKey.Port = m.GetInt("port") topologyRecovery.AnalysisEntry.Analysis = inst.AnalysisCode(m.GetString("analysis")) - topologyRecovery.AnalysisEntry.ClusterDetails.ClusterName = m.GetString("cluster_name") + topologyRecovery.AnalysisEntry.ClusterDetails.Keyspace = m.GetString("keyspace") + topologyRecovery.AnalysisEntry.ClusterDetails.Shard = m.GetString("shard") topologyRecovery.AnalysisEntry.CountReplicas = m.GetUint("count_affected_replicas") topologyRecovery.SuccessorKey = &inst.InstanceKey{} @@ -509,12 +520,13 @@ func readRecoveries(whereCondition string, limit string, args []any) ([]*Topolog // ReadInActivePeriodClusterRecovery reads recoveries (possibly complete!) that are in active period. 
// (may be used to block further recoveries on this cluster) -func ReadInActivePeriodClusterRecovery(clusterName string) ([]*TopologyRecovery, error) { +func ReadInActivePeriodClusterRecovery(keyspace string, shard string) ([]*TopologyRecovery, error) { whereClause := ` where in_active_period=1 - and cluster_name=?` - return readRecoveries(whereClause, ``, sqlutils.Args(clusterName)) + and keyspace=? + and shard=?` + return readRecoveries(whereClause, ``, sqlutils.Args(keyspace, shard)) } // ReadInActivePeriodSuccessorInstanceRecovery reads completed recoveries for a given instance, where said instance @@ -529,17 +541,13 @@ func ReadInActivePeriodSuccessorInstanceRecovery(instanceKey *inst.InstanceKey) } // ReadRecentRecoveries reads latest recovery entries from topology_recovery -func ReadRecentRecoveries(clusterName string, unacknowledgedOnly bool, page int) ([]*TopologyRecovery, error) { +func ReadRecentRecoveries(unacknowledgedOnly bool, page int) ([]*TopologyRecovery, error) { whereConditions := []string{} whereClause := "" args := sqlutils.Args() if unacknowledgedOnly { whereConditions = append(whereConditions, `acknowledged=0`) } - if clusterName != "" { - whereConditions = append(whereConditions, `cluster_name=?`) - args = append(args, clusterName) - } if len(whereConditions) > 0 { whereClause = fmt.Sprintf("where %s", strings.Join(whereConditions, " and ")) } diff --git a/go/vt/vtorc/logic/topology_recovery_dao_test.go b/go/vt/vtorc/logic/topology_recovery_dao_test.go new file mode 100644 index 00000000000..b977e9c4371 --- /dev/null +++ b/go/vt/vtorc/logic/topology_recovery_dao_test.go @@ -0,0 +1,70 @@ +/* +Copyright 2022 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package logic + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/vt/vtorc/db" + "vitess.io/vitess/go/vt/vtorc/inst" +) + +// TestTopologyRecovery tests various operations related to topology recovery like reading from and writing it to the database. +func TestTopologyRecovery(t *testing.T) { + // Open the vtorc + // After the test completes delete everything from the vitess_tablet table + orcDb, err := db.OpenVTOrc() + require.NoError(t, err) + defer func() { + _, err = orcDb.Exec("delete from topology_recovery") + require.NoError(t, err) + }() + + replicationAnalysis := inst.ReplicationAnalysis{ + AnalyzedInstanceKey: inst.InstanceKey{ + Hostname: hostname, + Port: 101, + }, + TabletType: tab101.Type, + ClusterDetails: inst.ClusterInfo{ + Keyspace: keyspace, + Shard: shard, + }, + AnalyzedKeyspace: keyspace, + AnalyzedShard: shard, + Analysis: inst.ReplicaIsWritable, + IsReadOnly: false, + } + topologyRecovery := NewTopologyRecovery(replicationAnalysis) + + t.Run("writing to topology recovery", func(t *testing.T) { + topologyRecovery, err = writeTopologyRecovery(topologyRecovery) + require.NoError(t, err) + // The ID field should be populated after the insert + require.Greater(t, topologyRecovery.ID, int64(0)) + }) + + t.Run("read recoveries", func(t *testing.T) { + recoveries, err := ReadRecentRecoveries(false, 0) + require.NoError(t, err) + require.Len(t, recoveries, 1) + // Assert that the ID field matches the one that we just wrote + require.EqualValues(t, topologyRecovery.ID, recoveries[0].ID) + }) +} diff --git 
a/go/vt/vtorc/server/api.go b/go/vt/vtorc/server/api.go index ab97385ce4a..d488633f376 100644 --- a/go/vt/vtorc/server/api.go +++ b/go/vt/vtorc/server/api.go @@ -40,6 +40,8 @@ const ( enableGlobalRecoveriesAPI = "/api/enable-global-recoveries" replicationAnalysisAPI = "/api/replication-analysis" healthAPI = "/debug/health" + + shardWithoutKeyspaceFilteringErrorStr = "Filtering by shard without keyspace isn't supported" ) var ( @@ -118,18 +120,13 @@ func returnAsJSON(response http.ResponseWriter, code int, stuff any) { // problemsAPIHandler is the handler for the problemsAPI endpoint func problemsAPIHandler(response http.ResponseWriter, request *http.Request) { // This api also supports filtering by shard and keyspace provided. - // Currently, both of them have to be provided in order to filter the instances. - // Once we split the cluster_name field into keyspace and shard, we can support - // filtering just by keyspace as well. shard := request.URL.Query().Get("shard") keyspace := request.URL.Query().Get("keyspace") - clusterName := "" - // Override the cluster name to filter by only when both the parameters - // are specified and not empty - if keyspace != "" && shard != "" { - clusterName = inst.GetClusterNameFromKeyspaceAndShard(keyspace, shard) + if shard != "" && keyspace == "" { + http.Error(response, shardWithoutKeyspaceFilteringErrorStr, http.StatusBadRequest) + return } - instances, err := inst.ReadProblemInstances(clusterName) + instances, err := inst.ReadProblemInstances(keyspace, shard) if err != nil { http.Error(response, err.Error(), http.StatusInternalServerError) return @@ -160,18 +157,13 @@ func enableGlobalRecoveriesAPIHandler(response http.ResponseWriter) { // replicationAnalysisAPIHandler is the handler for the replicationAnalysisAPI endpoint func replicationAnalysisAPIHandler(response http.ResponseWriter, request *http.Request) { // This api also supports filtering by shard and keyspace provided. 
- // Currently, both of them have to be provided in order to filter the replication analysis. - // Once we split the cluster_name field into keyspace and shard, we can support - // filtering just by keyspace as well. shard := request.URL.Query().Get("shard") keyspace := request.URL.Query().Get("keyspace") - clusterName := "" - // Override the cluster name to filter by only when both the parameters - // are specified and not empty - if keyspace != "" && shard != "" { - clusterName = inst.GetClusterNameFromKeyspaceAndShard(keyspace, shard) + if shard != "" && keyspace == "" { + http.Error(response, shardWithoutKeyspaceFilteringErrorStr, http.StatusBadRequest) + return } - analysis, err := inst.GetReplicationAnalysis(clusterName, &inst.ReplicationAnalysisHints{}) + analysis, err := inst.GetReplicationAnalysis(keyspace, shard, &inst.ReplicationAnalysisHints{}) if err != nil { http.Error(response, err.Error(), http.StatusInternalServerError) return diff --git a/go/vt/vtorc/test/recovery_analysis.go b/go/vt/vtorc/test/recovery_analysis.go index f4cdcfd7db9..cf030d62ce7 100644 --- a/go/vt/vtorc/test/recovery_analysis.go +++ b/go/vt/vtorc/test/recovery_analysis.go @@ -48,8 +48,6 @@ type InfoForRecoveryAnalysis struct { LogFile string LogPos int64 IsStaleBinlogCoordinates int - ClusterName string - ClusterDomain string GTIDMode string LastCheckValid int LastCheckPartialSuccess int @@ -91,8 +89,6 @@ func (info *InfoForRecoveryAnalysis) ConvertToRowMap() sqlutils.RowMap { rowMap := make(sqlutils.RowMap) rowMap["binary_log_file"] = sqlutils.CellData{String: info.LogFile, Valid: true} rowMap["binary_log_pos"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.LogPos), Valid: true} - rowMap["cluster_domain"] = sqlutils.CellData{String: info.ClusterDomain, Valid: true} - rowMap["cluster_name"] = sqlutils.CellData{String: info.ClusterName, Valid: true} rowMap["count_binlog_server_replicas"] = sqlutils.CellData{Valid: false} rowMap["count_co_primary_replicas"] = 
sqlutils.CellData{Valid: false} rowMap["count_delayed_replicas"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.CountDelayedReplicas), Valid: true} @@ -165,6 +161,4 @@ func (info *InfoForRecoveryAnalysis) SetValuesFromTabletInfo() { info.DataCenter = info.TabletInfo.Alias.Cell info.Keyspace = info.TabletInfo.Keyspace info.Shard = info.TabletInfo.Shard - info.ClusterName = fmt.Sprintf("%v:%v", info.TabletInfo.Keyspace, info.TabletInfo.Shard) - info.ClusterDomain = fmt.Sprintf("%v:%d", info.TabletInfo.MysqlHostname, info.TabletInfo.MysqlPort) } From a1ae5b31a2dc9687573a8f404fc5aaa28cb8b8fd Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Tue, 17 Jan 2023 09:13:48 +0530 Subject: [PATCH 5/9] Fix insert query of blocked_recovery table in VTOrc (#12091) * feat: add failing test and fix the query of insertion Signed-off-by: Manan Gupta * empty-commit Signed-off-by: Manan Gupta Signed-off-by: Manan Gupta --- go/vt/vtorc/logic/topology_recovery_dao.go | 1 + .../vtorc/logic/topology_recovery_dao_test.go | 37 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/go/vt/vtorc/logic/topology_recovery_dao.go b/go/vt/vtorc/logic/topology_recovery_dao.go index f2254fe542e..65df9c1ebed 100644 --- a/go/vt/vtorc/logic/topology_recovery_dao.go +++ b/go/vt/vtorc/logic/topology_recovery_dao.go @@ -262,6 +262,7 @@ func RegisterBlockedRecoveries(analysisEntry *inst.ReplicationAnalysis, blocking ?, ?, ?, + ?, NOW(), ? 
) diff --git a/go/vt/vtorc/logic/topology_recovery_dao_test.go b/go/vt/vtorc/logic/topology_recovery_dao_test.go index b977e9c4371..a89c28e7572 100644 --- a/go/vt/vtorc/logic/topology_recovery_dao_test.go +++ b/go/vt/vtorc/logic/topology_recovery_dao_test.go @@ -19,6 +19,7 @@ package logic import ( "testing" + "github.com/openark/golib/sqlutils" "github.com/stretchr/testify/require" "vitess.io/vitess/go/vt/vtorc/db" @@ -68,3 +69,39 @@ func TestTopologyRecovery(t *testing.T) { require.EqualValues(t, topologyRecovery.ID, recoveries[0].ID) }) } + +// TestBlockedRecoveryInsertion tests that we are able to insert into the blocked_recovery table. +func TestBlockedRecoveryInsertion(t *testing.T) { + orcDb, err := db.OpenVTOrc() + require.NoError(t, err) + defer func() { + _, err = orcDb.Exec("delete from blocked_topology_recovery") + require.NoError(t, err) + }() + + analysisEntry := &inst.ReplicationAnalysis{ + AnalyzedInstanceKey: inst.InstanceKey{ + Hostname: "localhost", + Port: 100, + }, + ClusterDetails: inst.ClusterInfo{ + Keyspace: "ks", + Shard: "0", + }, + Analysis: inst.DeadPrimaryAndSomeReplicas, + } + blockedRecovery := &TopologyRecovery{ + ID: 1, + } + err = RegisterBlockedRecoveries(analysisEntry, []*TopologyRecovery{blockedRecovery}) + require.NoError(t, err) + + totalBlockedRecoveries := 0 + err = db.QueryVTOrc("select count(*) as blocked_recoveries from blocked_topology_recovery", nil, func(rowMap sqlutils.RowMap) error { + totalBlockedRecoveries = rowMap.GetInt("blocked_recoveries") + return nil + }) + require.NoError(t, err) + // There should be 1 blocked recovery after insertion + require.Equal(t, 1, totalBlockedRecoveries) +} From 4ab6f7458ca0af7e6e6aa11f8948f9bdce7a7d4a Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Wed, 18 Jan 2023 11:06:58 +0530 Subject: [PATCH 6/9] Fix: VTOrc forgetting old instances (#12089) * test: add a failing test for the case where the port changes for a tablet 
Signed-off-by: Manan Gupta * feat: fix the issue by adding alias as a unique field Signed-off-by: Manan Gupta * empty-commit Signed-off-by: Manan Gupta Signed-off-by: Manan Gupta --- go/vt/vtorc/db/generate_base.go | 2 ++ go/vt/vtorc/inst/analysis_dao_test.go | 8 ++--- go/vt/vtorc/inst/tablet_dao.go | 5 ++-- go/vt/vtorc/logic/tablet_discovery_test.go | 34 ++++++++++++++++++++++ 4 files changed, 43 insertions(+), 6 deletions(-) diff --git a/go/vt/vtorc/db/generate_base.go b/go/vt/vtorc/db/generate_base.go index 3dd3cc437e4..88b26ed2c27 100644 --- a/go/vt/vtorc/db/generate_base.go +++ b/go/vt/vtorc/db/generate_base.go @@ -754,6 +754,7 @@ DROP TABLE IF EXISTS vitess_tablet `, ` CREATE TABLE vitess_tablet ( + alias varchar(256) NOT NULL, hostname varchar(128) NOT NULL, port smallint NOT NULL, keyspace varchar(128) NOT NULL, @@ -762,6 +763,7 @@ CREATE TABLE vitess_tablet ( tablet_type smallint(5) NOT NULL, primary_timestamp timestamp NOT NULL, info varchar(512) NOT NULL, + UNIQUE (alias), PRIMARY KEY (hostname, port) )`, ` diff --git a/go/vt/vtorc/inst/analysis_dao_test.go b/go/vt/vtorc/inst/analysis_dao_test.go index ce293bd5051..480986e34ba 100644 --- a/go/vt/vtorc/inst/analysis_dao_test.go +++ b/go/vt/vtorc/inst/analysis_dao_test.go @@ -607,10 +607,10 @@ func TestGetReplicationAnalysis(t *testing.T) { `INSERT INTO database_instance VALUES('localhost',6711,'2022-12-28 07:26:04','2022-12-28 07:26:04',1094500338,'8.0.31','ROW',1,1,'vt-0000000100-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000100-relay-bin.000002',15815,0,1,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a5138-8680-11ed-acf8-d6b0ef9f4eaa','2022-12-28 
07:26:04','',1,0,0,'zone1-0000000100','Homebrew','8.0','FULL',10103920,0,1,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a5138-8680-11ed-acf8-d6b0ef9f4eaa',1,1,'',1000000000000000000,1,0,1,0,'',0,'','','[]','',0);`, `INSERT INTO database_instance VALUES('localhost',6714,'2022-12-28 07:26:04','2022-12-28 07:26:04',390954723,'8.0.31','ROW',1,1,'vt-0000000101-bin.000001',15583,'',0,0,0,'',0,'',0,NULL,NULL,0,'','',0,0,'',0,0,0,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a4cc4-8680-11ed-a104-47706090afbd','2022-12-28 07:26:04','',0,0,0,'zone1-0000000101','Homebrew','8.0','FULL',11366095,1,1,'ON',1,'','','729a4cc4-8680-11ed-a104-47706090afbd',-1,-1,'',1000000000000000000,1,1,0,2,'',0,'','','[]','',0);`, `INSERT INTO database_instance VALUES('localhost',6756,'2022-12-28 07:26:05','2022-12-28 07:26:05',444286571,'8.0.31','ROW',1,1,'vt-0000000200-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000200-relay-bin.000002',15815,0,1,0,'zone2','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a497c-8680-11ed-8ad4-3f51d747db75','2022-12-28 07:26:05','',1,0,0,'zone2-0000000200','Homebrew','8.0','FULL',10443112,0,1,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a497c-8680-11ed-8ad4-3f51d747db75',1,1,'',1000000000000000000,1,0,1,0,'',0,'','','[]','',0);`, - `INSERT INTO vitess_tablet VALUES('localhost',6711,'ks','0','zone1',2,'0001-01-01 
00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3130307d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363731307d20706f72745f6d61703a7b6b65793a227674222076616c75653a363730397d206b657973706163653a226b73222073686172643a22302220747970653a5245504c494341206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363731312064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, - `INSERT INTO vitess_tablet VALUES('localhost',6714,'ks','0','zone1',1,'2022-12-28 07:23:25.129898+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3130317d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363731337d20706f72745f6d61703a7b6b65793a227674222076616c75653a363731327d206b657973706163653a226b73222073686172643a22302220747970653a5052494d415259206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a36373134207072696d6172795f7465726d5f73746172745f74696d653a7b7365636f6e64733a31363732323132323035206e616e6f7365636f6e64733a3132393839383030307d2064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, - `INSERT INTO vitess_tablet VALUES('localhost',6747,'ks','0','zone1',3,'0001-01-01 00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3131327d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363734367d20706f72745f6d61703a7b6b65793a227674222076616c75653a363734357d206b657973706163653a226b73222073686172643a22302220747970653a52444f4e4c59206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363734372064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, - `INSERT INTO vitess_tablet VALUES('localhost',6756,'ks','0','zone2',2,'0001-01-01 
00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653222207569643a3230307d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363735357d20706f72745f6d61703a7b6b65793a227674222076616c75653a363735347d206b657973706163653a226b73222073686172643a22302220747970653a5245504c494341206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363735362064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, + `INSERT INTO vitess_tablet VALUES('zone1-0000000100','localhost',6711,'ks','0','zone1',2,'0001-01-01 00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3130307d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363731307d20706f72745f6d61703a7b6b65793a227674222076616c75653a363730397d206b657973706163653a226b73222073686172643a22302220747970653a5245504c494341206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363731312064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, + `INSERT INTO vitess_tablet VALUES('zone1-0000000101','localhost',6714,'ks','0','zone1',1,'2022-12-28 07:23:25.129898+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3130317d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363731337d20706f72745f6d61703a7b6b65793a227674222076616c75653a363731327d206b657973706163653a226b73222073686172643a22302220747970653a5052494d415259206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a36373134207072696d6172795f7465726d5f73746172745f74696d653a7b7365636f6e64733a31363732323132323035206e616e6f7365636f6e64733a3132393839383030307d2064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, + `INSERT INTO vitess_tablet VALUES('zone1-0000000112','localhost',6747,'ks','0','zone1',3,'0001-01-01 
00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3131327d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363734367d20706f72745f6d61703a7b6b65793a227674222076616c75653a363734357d206b657973706163653a226b73222073686172643a22302220747970653a52444f4e4c59206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363734372064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, + `INSERT INTO vitess_tablet VALUES('zone2-0000000200','localhost',6756,'ks','0','zone2',2,'0001-01-01 00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653222207569643a3230307d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363735357d20706f72745f6d61703a7b6b65793a227674222076616c75653a363735347d206b657973706163653a226b73222073686172643a22302220747970653a5245504c494341206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363735362064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, `INSERT INTO vitess_keyspace VALUES('ks',0,'semi_sync');`, } diff --git a/go/vt/vtorc/inst/tablet_dao.go b/go/vt/vtorc/inst/tablet_dao.go index 0e258bf0882..cd762a6883e 100644 --- a/go/vt/vtorc/inst/tablet_dao.go +++ b/go/vt/vtorc/inst/tablet_dao.go @@ -180,11 +180,12 @@ func SaveTablet(tablet *topodatapb.Tablet) error { _, err = db.ExecVTOrc(` replace into vitess_tablet ( - hostname, port, cell, keyspace, shard, tablet_type, primary_timestamp, info + alias, hostname, port, cell, keyspace, shard, tablet_type, primary_timestamp, info ) values ( - ?, ?, ?, ?, ?, ?, ?, ? + ?, ?, ?, ?, ?, ?, ?, ?, ? 
) `, + topoproto.TabletAliasString(tablet.Alias), tablet.MysqlHostname, int(tablet.MysqlPort), tablet.Alias.Cell, diff --git a/go/vt/vtorc/logic/tablet_discovery_test.go b/go/vt/vtorc/logic/tablet_discovery_test.go index 410f1a70e0a..67394b78efe 100644 --- a/go/vt/vtorc/logic/tablet_discovery_test.go +++ b/go/vt/vtorc/logic/tablet_discovery_test.go @@ -22,6 +22,7 @@ import ( "testing" "github.com/google/go-cmp/cmp" + "github.com/openark/golib/sqlutils" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "google.golang.org/protobuf/proto" @@ -175,6 +176,26 @@ func TestRefreshTabletsInKeyspaceShard(t *testing.T) { // We expect 1 tablet to be refreshed since that is the only one that has changed verifyRefreshTabletsInKeyspaceShard(t, false, 1, tablets) }) + + t.Run("change the port and call refreshTabletsInKeyspaceShard again", func(t *testing.T) { + defer func() { + _, err = ts.UpdateTabletFields(context.Background(), tab100.Alias, func(tablet *topodatapb.Tablet) error { + tablet.MysqlPort = 100 + return nil + }) + tab100.MysqlPort = 100 + }() + // Let's assume tab100 restarted on a different pod. 
This would change its tablet hostname and port + _, err = ts.UpdateTabletFields(context.Background(), tab100.Alias, func(tablet *topodatapb.Tablet) error { + tablet.MysqlPort = 39293 + return nil + }) + require.NoError(t, err) + tab100.MysqlPort = 39293 + // We expect 1 tablet to be refreshed since that is the only one that has changed + // Also the old tablet should be forgotten + verifyRefreshTabletsInKeyspaceShard(t, false, 1, tablets) + }) } func TestShardPrimary(t *testing.T) { @@ -258,6 +279,7 @@ func verifyRefreshTabletsInKeyspaceShard(t *testing.T, forceRefresh bool, instan for _, tablet := range tablets { verifyTabletInfo(t, tablet, "") } + verifyTabletCount(t, len(tablets)) // Verify that refresh as many tablets as expected assert.EqualValues(t, instanceRefreshRequired, instancesRefreshed.Load()) } @@ -280,3 +302,15 @@ func verifyTabletInfo(t *testing.T, tabletWanted *topodatapb.Tablet, errString s assert.Empty(t, diff) } } + +// verifyTabletCount verifies that the number of tablets in the vitess_tablet table match the given count +func verifyTabletCount(t *testing.T, countWanted int) { + t.Helper() + totalTablets := 0 + err := db.QueryVTOrc("select count(*) as total_tablets from vitess_tablet", nil, func(rowMap sqlutils.RowMap) error { + totalTablets = rowMap.GetInt("total_tablets") + return nil + }) + require.NoError(t, err) + require.Equal(t, countWanted, totalTablets) +} From 96a756e8053d6929b0d285ec45fbc9fd9cbd4de1 Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Thu, 2 Feb 2023 18:23:51 +0100 Subject: [PATCH 7/9] Move vtorc from go-sqlite3 to modernc.org/sqlite (#12214) * Move vtorc from go-sqlite3 to modernc.org/sqlite This moves vtorc from the go-sqlite3 library that uses CGO, to use modernc.org/sqlite which is a pure Go implementation. vtorc is the only component we have to build with CGO but it's causing pain for releases since we need to build it against an old Linux for linking against glibc. 
Using modernc.org/sqlite allows for using Go only again and makes all Vitess components buildable without CGO. In https://datastation.multiprocess.io/blog/2022-05-12-sqlite-in-go-with-and-without-cgo.html someone ran some basic benchmarks. It shows that the pure Go version can be twice as slow, but the usage of vtorc is very limited and we operate on small datasets, so I think the performance impact purely of a somewhat slower sqlite implementation is negligible. None of this is in a hot query serving path or anything like that, so I have little concern performance-wise. Signed-off-by: Dirkjan Bussink * Fix error handling in RowToArray Signed-off-by: Dirkjan Bussink --------- Signed-off-by: Dirkjan Bussink --- go.mod | 4 ++-- go.sum | 7 ++++--- go/vt/vtorc/logic/tablet_discovery_test.go | 3 ++- go/vt/vtorc/logic/topology_recovery_dao_test.go | 3 ++- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/go.mod b/go.mod index 3166c13c9c8..2e9d549853c 100644 --- a/go.mod +++ b/go.mod @@ -48,7 +48,7 @@ require ( github.com/klauspost/compress v1.13.0 github.com/klauspost/pgzip v1.2.4 github.com/krishicks/yaml-patch v0.0.10 - github.com/magiconair/properties v1.8.5 + github.com/magiconair/properties v1.8.6 github.com/mattn/go-sqlite3 v1.14.16 // indirect github.com/minio/minio-go v0.0.0-20190131015406-c8a261de75c1 github.com/mitchellh/go-testing-interface v1.14.0 // indirect @@ -198,7 +198,7 @@ require ( modernc.org/ccgo/v3 v3.16.13 // indirect modernc.org/libc v1.22.2 // indirect modernc.org/mathutil v1.5.0 // indirect - modernc.org/memory v1.4.0 // indirect + modernc.org/memory v1.5.0 // indirect modernc.org/opt v0.1.3 // indirect modernc.org/strutil v1.1.3 // indirect modernc.org/token v1.0.1 // indirect diff --git a/go.sum b/go.sum index b9a439e07a5..f4f0f24414f 100644 --- a/go.sum +++ b/go.sum @@ -504,8 +504,9 @@ github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/krishicks/yaml-patch v0.0.10
h1:H4FcHpnNwVmw8u0MjPRjWyIXtco6zM2F78t+57oNM3E= github.com/krishicks/yaml-patch v0.0.10/go.mod h1:Sm5TchwZS6sm7RJoyg87tzxm2ZcKzdRE4Q7TjNhPrME= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= -github.com/magiconair/properties v1.8.5 h1:b6kJs+EmPFMYGkow9GiUyCyOvIwYetYJ3fSaWak/Gls= github.com/magiconair/properties v1.8.5/go.mod h1:y3VJvCyxH9uVvJTWEGAELF3aiYNyPKd5NZ3oSwXrF60= +github.com/magiconair/properties v1.8.6 h1:5ibWZ6iY0NctNGWo87LalDlEZ6R41TqbbDamhfG/Qzo= +github.com/magiconair/properties v1.8.6/go.mod h1:y3VJvCyxH9uVvJTWEGAELF3aiYNyPKd5NZ3oSwXrF60= github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190312143242-1de009706dbe/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= @@ -1297,8 +1298,8 @@ modernc.org/libc v1.22.2 h1:4U7v51GyhlWqQmwCHj28Rdq2Yzwk55ovjFrdPjs8Hb0= modernc.org/libc v1.22.2/go.mod h1:uvQavJ1pZ0hIoC/jfqNoMLURIMhKzINIWypNM17puug= modernc.org/mathutil v1.5.0 h1:rV0Ko/6SfM+8G+yKiyI830l3Wuz1zRutdslNoQ0kfiQ= modernc.org/mathutil v1.5.0/go.mod h1:mZW8CKdRPY1v87qxC/wUdX5O1qDzXMP5TH3wjfpga6E= -modernc.org/memory v1.4.0 h1:crykUfNSnMAXaOJnnxcSzbUGMqkLWjklJKkBK2nwZwk= -modernc.org/memory v1.4.0/go.mod h1:PkUhL0Mugw21sHPeskwZW4D6VscE/GQJOnIpCnW6pSU= +modernc.org/memory v1.5.0 h1:N+/8c5rE6EqugZwHii4IFsaJ7MUhoWX07J5tC/iI5Ds= +modernc.org/memory v1.5.0/go.mod h1:PkUhL0Mugw21sHPeskwZW4D6VscE/GQJOnIpCnW6pSU= modernc.org/opt v0.1.3 h1:3XOZf2yznlhC+ibLltsDGzABUGVx8J6pnFMS3E4dcq4= modernc.org/opt v0.1.3/go.mod h1:WdSiB5evDcignE70guQKxYUl14mgWtbClRi5wmkkTX0= modernc.org/sqlite v1.20.3 h1:SqGJMMxjj1PHusLxdYxeQSodg7Jxn9WWkaAQjKrntZs= diff --git a/go/vt/vtorc/logic/tablet_discovery_test.go b/go/vt/vtorc/logic/tablet_discovery_test.go index 67394b78efe..d43cebefc0f 100644 --- 
a/go/vt/vtorc/logic/tablet_discovery_test.go +++ b/go/vt/vtorc/logic/tablet_discovery_test.go @@ -22,11 +22,12 @@ import ( "testing" "github.com/google/go-cmp/cmp" - "github.com/openark/golib/sqlutils" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "google.golang.org/protobuf/proto" + "vitess.io/vitess/go/vt/external/golib/sqlutils" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/proto/vttime" "vitess.io/vitess/go/vt/topo/memorytopo" diff --git a/go/vt/vtorc/logic/topology_recovery_dao_test.go b/go/vt/vtorc/logic/topology_recovery_dao_test.go index a89c28e7572..f01e16560a8 100644 --- a/go/vt/vtorc/logic/topology_recovery_dao_test.go +++ b/go/vt/vtorc/logic/topology_recovery_dao_test.go @@ -19,9 +19,10 @@ package logic import ( "testing" - "github.com/openark/golib/sqlutils" "github.com/stretchr/testify/require" + "vitess.io/vitess/go/vt/external/golib/sqlutils" + "vitess.io/vitess/go/vt/vtorc/db" "vitess.io/vitess/go/vt/vtorc/inst" ) From 53a0e0ca85f07b3b85de9150b6fdbe323b9cb3d2 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Tue, 26 Mar 2024 23:49:20 +0100 Subject: [PATCH 8/9] see if CI passes on v14.0.5 as previous release Signed-off-by: Tim Vaillancourt --- tools/get_previous_release.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/get_previous_release.sh b/tools/get_previous_release.sh index 39cf1e58815..15195909bf4 100755 --- a/tools/get_previous_release.sh +++ b/tools/get_previous_release.sh @@ -25,4 +25,5 @@ # # I will hardcode the previous Slack release here because it is static. # -echo slack-vitess-r14.0.5 +#echo slack-vitess-r14.0.5 +echo v14.0.5 From e75b9e4b5699bf836df7505d75c73716cf66fd3d Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Tue, 16 Apr 2024 20:57:28 +0200 Subject: [PATCH 9/9] Revert "see if CI passes on v14.0.5 as previous release" This reverts commit 53a0e0ca85f07b3b85de9150b6fdbe323b9cb3d2. 
--- tools/get_previous_release.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/get_previous_release.sh b/tools/get_previous_release.sh index 15195909bf4..39cf1e58815 100755 --- a/tools/get_previous_release.sh +++ b/tools/get_previous_release.sh @@ -25,5 +25,4 @@ # # I will hardcode the previous Slack release here because it is static. # -#echo slack-vitess-r14.0.5 -echo v14.0.5 +echo slack-vitess-r14.0.5