Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VTOrc: Cleanup node registration and unused code #15617

Merged
merged 3 commits into from
Apr 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions go/test/endtoend/vtorc/api/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ func TestAPIEndpoints(t *testing.T) {
// Verify when VTOrc is healthy, it has also run the first discovery.
assert.Equal(t, 200, status)
assert.Contains(t, resp, `"Healthy": true,`)
assert.Contains(t, resp, `"DiscoveredOnce": true`)

// find primary from topo
primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
Expand Down Expand Up @@ -77,7 +76,6 @@ func TestAPIEndpoints(t *testing.T) {
require.NoError(t, err)
assert.Equal(t, 200, status)
assert.Contains(t, resp, `"Healthy": true,`)
assert.Contains(t, resp, `"DiscoveredOnce": true`)
})

t.Run("Liveness API", func(t *testing.T) {
Expand Down
1 change: 0 additions & 1 deletion go/vt/vtctl/reparentutil/promotionrule/promotion_rule.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
)

// CandidatePromotionRule describe the promotion preference/rule for an instance.
// It maps to promotion_rule column in candidate_database_instance
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see we removed that table. Where does this map to now? Is it helpful to document that?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nothing, we don't store promotion rules in the tables anymore. They're gotten from the interface implementation.

type CandidatePromotionRule string

const (
Expand Down
5 changes: 0 additions & 5 deletions go/vt/vtorc/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,10 @@ import (
"vitess.io/vitess/go/vt/log"
)

const (
LostInRecoveryDowntimeSeconds int = 60 * 60 * 24 * 365
)

var configurationLoaded = make(chan bool)

const (
HealthPollSeconds = 1
ActiveNodeExpireSeconds = 5
AuditPageSize = 20
DebugMetricsIntervalSeconds = 10
StaleInstanceCoordinatesExpireSeconds = 60
Expand Down
63 changes: 1 addition & 62 deletions go/vt/vtorc/db/generate_base.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,12 @@ package db
var TableNames = []string{
"database_instance",
"audit",
"active_node",
"node_health",
"topology_recovery",
"database_instance_topology_history",
"candidate_database_instance",
"recovery_detection",
"database_instance_last_analysis",
"database_instance_analysis_changelog",
"node_health_history",
"vtorc_db_deployments",
"global_recovery_disable",
"topology_recovery_steps",
Expand Down Expand Up @@ -137,32 +134,11 @@ CREATE INDEX audit_timestamp_idx_audit ON audit (audit_timestamp)
CREATE INDEX alias_idx_audit ON audit (alias, audit_timestamp)
`,
`
DROP TABLE IF EXISTS active_node
`,
`
CREATE TABLE active_node (
anchor tinyint NOT NULL,
hostname varchar(128) NOT NULL,
token varchar(128) NOT NULL,
last_seen_active timestamp not null default (''),
first_seen_active timestamp NOT NULL DEFAULT '1971-01-01 00:00:00',
PRIMARY KEY (anchor)
)`,
`
DROP TABLE IF EXISTS node_health
`,
`
CREATE TABLE node_health (
hostname varchar(128) NOT NULL,
token varchar(128) NOT NULL,
last_seen_active timestamp not null default (''),
extra_info varchar(128) not null default '',
command varchar(128) not null default '',
app_version varchar(64) NOT NULL DEFAULT "",
first_seen_active timestamp NOT NULL DEFAULT '1971-01-01 00:00:00',
db_backend varchar(255) NOT NULL DEFAULT "",
incrementing_indicator bigint not null default 0,
PRIMARY KEY (hostname, token)
last_seen_active timestamp not null default ('')
)`,
`
DROP TABLE IF EXISTS topology_recovery
Expand Down Expand Up @@ -205,20 +181,6 @@ CREATE TABLE database_instance_topology_history (
CREATE INDEX keyspace_shard_idx_database_instance_topology_history ON database_instance_topology_history (snapshot_unix_timestamp, keyspace, shard)
`,
`
DROP TABLE IF EXISTS candidate_database_instance
`,
`
CREATE TABLE candidate_database_instance (
alias varchar(256) NOT NULL,
last_suggested timestamp not null default (''),
priority TINYINT SIGNED NOT NULL DEFAULT 1,
promotion_rule text check(promotion_rule in ('must', 'prefer', 'neutral', 'prefer_not', 'must_not')) NOT NULL DEFAULT 'neutral',
PRIMARY KEY (alias)
)`,
`
CREATE INDEX last_suggested_idx_candidate_database_instance ON candidate_database_instance (last_suggested)
`,
`
DROP TABLE IF EXISTS recovery_detection
`,
`
Expand Down Expand Up @@ -259,26 +221,6 @@ CREATE TABLE database_instance_analysis_changelog (
CREATE INDEX analysis_timestamp_idx_database_instance_analysis_changelog ON database_instance_analysis_changelog (analysis_timestamp)
`,
`
DROP TABLE IF EXISTS node_health_history
`,
`
CREATE TABLE node_health_history (
history_id integer,
hostname varchar(128) NOT NULL,
token varchar(128) NOT NULL,
first_seen_active timestamp NOT NULL,
extra_info varchar(128) NOT NULL,
command varchar(128) not null default '',
app_version varchar(64) NOT NULL DEFAULT "",
PRIMARY KEY (history_id)
)`,
`
CREATE INDEX first_seen_active_idx_node_health_history ON node_health_history (first_seen_active)
`,
`
CREATE UNIQUE INDEX hostname_token_idx_node_health_history ON node_health_history (hostname, token)
`,
`
DROP TABLE IF EXISTS vtorc_db_deployments
`,
`
Expand Down Expand Up @@ -379,9 +321,6 @@ CREATE INDEX instance_timestamp_idx_database_instance_analysis_changelog on data
CREATE INDEX detection_idx_topology_recovery on topology_recovery (detection_id)
`,
`
CREATE INDEX last_seen_active_idx_node_health on node_health (last_seen_active)
`,
`
CREATE INDEX recovery_id_idx_topology_recovery_steps ON topology_recovery_steps(recovery_id)
`,
}
28 changes: 0 additions & 28 deletions go/vt/vtorc/inst/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,7 @@ const (
AllPrimaryReplicasNotReplicatingOrDead AnalysisCode = "AllPrimaryReplicasNotReplicatingOrDead"
LockedSemiSyncPrimaryHypothesis AnalysisCode = "LockedSemiSyncPrimaryHypothesis"
LockedSemiSyncPrimary AnalysisCode = "LockedSemiSyncPrimary"
PrimaryWithoutReplicas AnalysisCode = "PrimaryWithoutReplicas"
BinlogServerFailingToConnectToPrimary AnalysisCode = "BinlogServerFailingToConnectToPrimary"
GraceFulPrimaryTakeover AnalysisCode = "GracefulPrimaryTakeover"
ErrantGTIDDetected AnalysisCode = "ErrantGTIDDetected"
)

Expand All @@ -83,34 +81,20 @@ type ReplicationAnalysisHints struct {
AuditAnalysis bool
}

type AnalysisInstanceType string

const (
AnalysisInstanceTypePrimary AnalysisInstanceType = "primary"
AnalysisInstanceTypeCoPrimary AnalysisInstanceType = "co-primary"
AnalysisInstanceTypeIntermediatePrimary AnalysisInstanceType = "intermediate-primary"
)

// ReplicationAnalysis notes analysis on replication chain status, per instance
type ReplicationAnalysis struct {
AnalyzedInstanceHostname string
AnalyzedInstancePort int
AnalyzedInstanceAlias string
AnalyzedInstancePrimaryAlias string
TabletType topodatapb.TabletType
PrimaryTimeStamp time.Time
ClusterDetails ClusterInfo
AnalyzedInstanceDataCenter string
AnalyzedInstanceRegion string
AnalyzedKeyspace string
AnalyzedShard string
// ShardPrimaryTermTimestamp is the primary term start time stored in the shard record.
ShardPrimaryTermTimestamp string
AnalyzedInstancePhysicalEnvironment string
AnalyzedInstanceBinlogCoordinates BinlogCoordinates
IsPrimary bool
IsClusterPrimary bool
IsCoPrimary bool
LastCheckValid bool
LastCheckPartialSuccess bool
CountReplicas uint
Expand Down Expand Up @@ -159,18 +143,6 @@ func (replicationAnalysis *ReplicationAnalysis) MarshalJSON() ([]byte, error) {
return json.Marshal(i)
}

// Get a string description of the analyzed instance type (primary? co-primary? intermediate-primary?)
func (replicationAnalysis *ReplicationAnalysis) GetAnalysisInstanceType() AnalysisInstanceType {
if replicationAnalysis.IsCoPrimary {
return AnalysisInstanceTypeCoPrimary
}

if replicationAnalysis.IsPrimary {
return AnalysisInstanceTypePrimary
}
return AnalysisInstanceTypeIntermediatePrimary
}

// ValidSecondsFromSeenToLastAttemptedCheck returns the maximum allowed elapsed time
// between last_attempted_check to last_checked before we consider the instance as invalid.
func ValidSecondsFromSeenToLastAttemptedCheck() uint {
Expand Down
15 changes: 0 additions & 15 deletions go/vt/vtorc/inst/analysis_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,6 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
query := `
SELECT
vitess_tablet.info AS tablet_info,
vitess_tablet.hostname,
vitess_tablet.port,
vitess_tablet.tablet_type,
vitess_tablet.primary_timestamp,
vitess_tablet.shard AS shard,
Expand All @@ -87,9 +85,6 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
primary_instance.read_only AS read_only,
MIN(primary_instance.gtid_errant) AS gtid_errant,
MIN(primary_instance.alias) IS NULL AS is_invalid,
MIN(primary_instance.data_center) AS data_center,
Copy link
Member

@deepthi deepthi Apr 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain why we don't need to store this any more? It's hard to see when you are reviewing a diff where we store the cell info which essentially provides the same functionality.

Copy link
Member Author

@GuptaManan100 GuptaManan100 Apr 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We still store it, we just don't need to read it at this point, because it was unused. ERS code will read it anyways when it gets all the tablets from the shard.

MIN(primary_instance.region) AS region,
MIN(primary_instance.physical_environment) AS physical_environment,
MIN(primary_instance.binary_log_file) AS binary_log_file,
MIN(primary_instance.binary_log_pos) AS binary_log_pos,
MIN(primary_tablet.info) AS primary_tablet_info,
Expand All @@ -115,7 +110,6 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
OR substr(primary_instance.source_host, 1, 2) = '//'
)
) AS is_primary,
MIN(primary_instance.is_co_primary) AS is_co_primary,
MIN(primary_instance.gtid_mode) AS gtid_mode,
COUNT(replica_instance.server_id) AS count_replicas,
IFNULL(
Expand Down Expand Up @@ -172,7 +166,6 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
MIN(
primary_instance.semi_sync_replica_enabled
) AS semi_sync_replica_enabled,
SUM(replica_instance.is_co_primary) AS count_co_primary_replicas,
SUM(replica_instance.oracle_gtid) AS count_oracle_gtid_replicas,
IFNULL(
SUM(
Expand Down Expand Up @@ -331,15 +324,8 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna

a.ShardPrimaryTermTimestamp = m.GetString("shard_primary_term_timestamp")
a.IsPrimary = m.GetBool("is_primary")
countCoPrimaryReplicas := m.GetUint("count_co_primary_replicas")
a.IsCoPrimary = m.GetBool("is_co_primary") || (countCoPrimaryReplicas > 0)
a.AnalyzedInstanceHostname = m.GetString("hostname")
a.AnalyzedInstancePort = m.GetInt("port")
a.AnalyzedInstanceAlias = topoproto.TabletAliasString(tablet.Alias)
a.AnalyzedInstancePrimaryAlias = topoproto.TabletAliasString(primaryTablet.Alias)
a.AnalyzedInstanceDataCenter = m.GetString("data_center")
a.AnalyzedInstanceRegion = m.GetString("region")
a.AnalyzedInstancePhysicalEnvironment = m.GetString("physical_environment")
a.AnalyzedInstanceBinlogCoordinates = BinlogCoordinates{
LogFile: m.GetString("binary_log_file"),
LogPos: m.GetUint32("binary_log_pos"),
Expand All @@ -359,7 +345,6 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
a.IsFailingToConnectToPrimary = m.GetBool("is_failing_to_connect_to_primary")
a.ReplicationStopped = m.GetBool("replication_stopped")
a.IsBinlogServer = m.GetBool("is_binlog_server")
a.ClusterDetails.ReadRecoveryInfo()
a.ErrantGTID = m.GetString("gtid_errant")

countValidOracleGTIDReplicas := m.GetUint("count_valid_oracle_gtid_replicas")
Expand Down
14 changes: 6 additions & 8 deletions go/vt/vtorc/inst/analysis_dao_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -917,21 +917,19 @@ func TestAuditInstanceAnalysisInChangelog(t *testing.T) {
// TestPostProcessAnalyses tests the functionality of the postProcessAnalyses function.
func TestPostProcessAnalyses(t *testing.T) {
ks0 := ClusterInfo{
Keyspace: "ks",
Shard: "0",
CountInstances: 4,
Keyspace: "ks",
Shard: "0",
}
ks80 := ClusterInfo{
Keyspace: "ks",
Shard: "80-",
CountInstances: 3,
Keyspace: "ks",
Shard: "80-",
}
clusters := map[string]*clusterAnalysis{
getKeyspaceShardName(ks0.Keyspace, ks0.Shard): {
totalTablets: int(ks0.CountInstances),
totalTablets: 4,
},
getKeyspaceShardName(ks80.Keyspace, ks80.Shard): {
totalTablets: int(ks80.CountInstances),
totalTablets: 3,
},
}

Expand Down
48 changes: 0 additions & 48 deletions go/vt/vtorc/inst/analysis_test.go

This file was deleted.

14 changes: 2 additions & 12 deletions go/vt/vtorc/inst/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,6 @@ package inst

// ClusterInfo makes for a cluster status/info summary
type ClusterInfo struct {
Keyspace string
Shard string
CountInstances uint
HeuristicLag int64
HasAutomatedPrimaryRecovery bool
HasAutomatedIntermediatePrimaryRecovery bool
}

// ReadRecoveryInfo
func (clusterInfo *ClusterInfo) ReadRecoveryInfo() {
clusterInfo.HasAutomatedPrimaryRecovery = true
clusterInfo.HasAutomatedIntermediatePrimaryRecovery = true
Keyspace string
Shard string
}
3 changes: 0 additions & 3 deletions go/vt/vtorc/logic/tablet_discovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,6 @@ func refreshAllTablets() {
}

func refreshTabletsUsing(loader func(tabletAlias string), forceRefresh bool) {
if !IsLeaderOrActive() {
return
}
if len(clustersToWatch) == 0 { // all known clusters
ctx, cancel := context.WithTimeout(context.Background(), topo.RemoteOperationTimeout)
defer cancel()
Expand Down
10 changes: 4 additions & 6 deletions go/vt/vtorc/logic/tablet_discovery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -353,14 +353,12 @@ func TestProcessHealth(t *testing.T) {
process.FirstDiscoveryCycleComplete.Store(false)
}()
// Verify in the beginning, we have the first DiscoveredOnce field false.
health, err := process.HealthTest()
require.NoError(t, err)
require.False(t, health.DiscoveredOnce)
_, discoveredOnce := process.HealthTest()
require.False(t, discoveredOnce)
ts = memorytopo.NewServer(context.Background(), cell1)
populateAllInformation()
require.True(t, process.FirstDiscoveryCycleComplete.Load())
// Verify after we populate all information, we have the first DiscoveredOnce field true.
health, err = process.HealthTest()
require.NoError(t, err)
require.True(t, health.DiscoveredOnce)
_, discoveredOnce = process.HealthTest()
require.True(t, discoveredOnce)
}
Loading
Loading