diff --git a/go/test/endtoend/vtorc/api/api_test.go b/go/test/endtoend/vtorc/api/api_test.go index 16f1df2b856..ebb39210572 100644 --- a/go/test/endtoend/vtorc/api/api_test.go +++ b/go/test/endtoend/vtorc/api/api_test.go @@ -115,20 +115,20 @@ func TestAPIEndpoints(t *testing.T) { return response == "[]" }) assert.Equal(t, 200, status, resp) - assert.Contains(t, resp, fmt.Sprintf(`"Port": %d`, replica.MySQLPort)) + assert.Contains(t, resp, fmt.Sprintf(`"AnalyzedInstanceAlias": "%s"`, replica.Alias)) assert.Contains(t, resp, `"Analysis": "ReplicationStopped"`) // Verify that filtering also works in the API as intended status, resp, err = utils.MakeAPICall(t, vtorc, "/api/replication-analysis?keyspace=ks&shard=0") require.NoError(t, err) assert.Equal(t, 200, status, resp) - assert.Contains(t, resp, fmt.Sprintf(`"Port": %d`, replica.MySQLPort)) + assert.Contains(t, resp, fmt.Sprintf(`"AnalyzedInstanceAlias": "%s"`, replica.Alias)) // Verify that filtering by keyspace also works in the API as intended status, resp, err = utils.MakeAPICall(t, vtorc, "/api/replication-analysis?keyspace=ks") require.NoError(t, err) assert.Equal(t, 200, status, resp) - assert.Contains(t, resp, fmt.Sprintf(`"Port": %d`, replica.MySQLPort)) + assert.Contains(t, resp, fmt.Sprintf(`"AnalyzedInstanceAlias": "%s"`, replica.Alias)) // Check that filtering using keyspace and shard works status, resp, err = utils.MakeAPICall(t, vtorc, "/api/replication-analysis?keyspace=ks&shard=80-") diff --git a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go index fe8b53f5103..e9756ad5d2b 100644 --- a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go +++ b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go @@ -71,12 +71,11 @@ func TestReadTopologyInstanceBufferable(t *testing.T) { } } - primaryInstance, err := inst.ReadTopologyInstanceBufferable(&inst.InstanceKey{ - Hostname: utils.Hostname, - Port: primary.MySQLPort, - }, nil) + 
primaryInstance, err := inst.ReadTopologyInstanceBufferable(primary.Alias, nil) require.NoError(t, err) require.NotNil(t, primaryInstance) + assert.Equal(t, utils.Hostname, primaryInstance.Hostname) + assert.Equal(t, primary.MySQLPort, primaryInstance.Port) assert.Contains(t, primaryInstance.InstanceAlias, "zone1") assert.NotEqual(t, 0, primaryInstance.ServerID) assert.Greater(t, len(primaryInstance.ServerUUID), 10) @@ -120,12 +119,11 @@ func TestReadTopologyInstanceBufferable(t *testing.T) { err = logic.EnableRecovery() require.NoError(t, err) - replicaInstance, err := inst.ReadTopologyInstanceBufferable(&inst.InstanceKey{ - Hostname: utils.Hostname, - Port: replica.MySQLPort, - }, nil) + replicaInstance, err := inst.ReadTopologyInstanceBufferable(replica.Alias, nil) require.NoError(t, err) require.NotNil(t, replicaInstance) + assert.Equal(t, utils.Hostname, replicaInstance.Hostname) + assert.Equal(t, replica.MySQLPort, replicaInstance.Port) assert.Contains(t, replicaInstance.InstanceAlias, "zone1") assert.NotEqual(t, 0, replicaInstance.ServerID) assert.Greater(t, len(replicaInstance.ServerUUID), 10) @@ -137,6 +135,8 @@ func TestReadTopologyInstanceBufferable(t *testing.T) { assert.Equal(t, "ROW", replicaInstance.BinlogFormat) assert.Equal(t, "ON", replicaInstance.GTIDMode) assert.Equal(t, "FULL", replicaInstance.BinlogRowImage) + assert.Equal(t, utils.Hostname, replicaInstance.SourceHost) + assert.Equal(t, primary.MySQLPort, replicaInstance.SourcePort) assert.Contains(t, replicaInstance.SelfBinlogCoordinates.LogFile, fmt.Sprintf("vt-0000000%d-bin", replica.TabletUID)) assert.Greater(t, replicaInstance.SelfBinlogCoordinates.LogPos, uint32(0)) assert.False(t, replicaInstance.SemiSyncPrimaryEnabled) diff --git a/go/vt/topotools/tablet.go b/go/vt/topotools/tablet.go index ef458ae7cd3..8bbca4b8c03 100644 --- a/go/vt/topotools/tablet.go +++ b/go/vt/topotools/tablet.go @@ -245,3 +245,29 @@ func TabletIdent(tablet *topodatapb.Tablet) string { func TargetIdent(target 
*querypb.Target) string { return fmt.Sprintf("%s/%s (%s)", target.Keyspace, target.Shard, target.TabletType) } + +// TabletEquality returns true iff two Tablets are identical for testing purposes +func TabletEquality(left, right *topodatapb.Tablet) bool { + if left.Keyspace != right.Keyspace { + return false + } + if left.Shard != right.Shard { + return false + } + if left.Hostname != right.Hostname { + return false + } + if left.Type != right.Type { + return false + } + if left.MysqlHostname != right.MysqlHostname { + return false + } + if left.MysqlPort != right.MysqlPort { + return false + } + if left.PrimaryTermStartTime.String() != right.PrimaryTermStartTime.String() { + return false + } + return topoproto.TabletAliasString(left.Alias) == topoproto.TabletAliasString(right.Alias) +} diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 73e55378bec..00d58f6dd89 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -46,9 +46,7 @@ const ( DiscoveryQueueCapacity = 100000 DiscoveryQueueMaxStatisticsSize = 120 DiscoveryCollectionRetentionSeconds = 120 - HostnameResolveMethod = "default" UnseenInstanceForgetHours = 240 // Number of hours after which an unseen instance is forgotten - ExpiryHostnameResolvesMinutes = 60 // Number of minutes after which to expire hostname-resolves CandidateInstanceExpireMinutes = 60 // Minutes after which a suggestion to use an instance as a candidate replica (to be preferably promoted on primary failover) is expired. FailureDetectionPeriodBlockMinutes = 60 // The time for which an instance's failure discovery is kept "active", so as to avoid concurrent "discoveries" of the instance's failure; this preceeds any recovery process, if any. 
) diff --git a/go/vt/vtorc/db/db.go b/go/vt/vtorc/db/db.go index 04150339c5c..aca3b248c26 100644 --- a/go/vt/vtorc/db/db.go +++ b/go/vt/vtorc/db/db.go @@ -72,7 +72,7 @@ func translateStatement(statement string) string { return sqlutils.ToSqlite3Dialect(statement) } -// registerVTOrcDeployment updates the vtorc_metadata table upon successful deployment +// registerVTOrcDeployment updates the vtorc_db_deployments table upon successful deployment func registerVTOrcDeployment(db *sql.DB) error { query := ` replace into vtorc_db_deployments ( @@ -82,7 +82,7 @@ func registerVTOrcDeployment(db *sql.DB) error { ) ` if _, err := execInternal(db, query, ""); err != nil { - log.Fatalf("Unable to write to vtorc_metadata: %+v", err) + log.Fatalf("Unable to write to vtorc_db_deployments: %+v", err) } return nil } @@ -188,15 +188,3 @@ func QueryVTOrc(query string, argsArray []any, onRow func(sqlutils.RowMap) error return err } - -// ReadTimeNow reads and returns the current timestamp as string. This is an unfortunate workaround -// to support both MySQL and SQLite in all possible timezones. SQLite only speaks UTC where MySQL has -// timezone support. By reading the time as string we get the database's de-facto notion of the time, -// which we can then feed back to it. 
-func ReadTimeNow() (timeNow string, err error) { - err = QueryVTOrc(`select now() as time_now`, nil, func(m sqlutils.RowMap) error { - timeNow = m.GetString("time_now") - return nil - }) - return timeNow, err -} diff --git a/go/vt/vtorc/db/generate_base.go b/go/vt/vtorc/db/generate_base.go index 88b26ed2c27..3d2f209e10f 100644 --- a/go/vt/vtorc/db/generate_base.go +++ b/go/vt/vtorc/db/generate_base.go @@ -23,6 +23,7 @@ DROP TABLE IF EXISTS database_instance `, ` CREATE TABLE database_instance ( + alias varchar(256) NOT NULL, hostname varchar(128) NOT NULL, port smallint NOT NULL, last_checked timestamp not null default (''), @@ -67,7 +68,6 @@ CREATE TABLE database_instance ( has_replication_credentials TINYint not null default 0, allow_tls TINYint not null default 0, semi_sync_enforced TINYint not null default 0, - instance_alias varchar(128) not null default '', version_comment varchar(128) NOT NULL DEFAULT '', major_version varchar(16) not null default '', binlog_row_image varchar(16) not null default '', @@ -87,14 +87,7 @@ CREATE TABLE database_instance ( semi_sync_primary_status TINYint NOT NULL DEFAULT 0, semi_sync_replica_status TINYint NOT NULL DEFAULT 0, semi_sync_primary_clients int NOT NULL DEFAULT 0, - replication_group_name VARCHAR(64) NOT NULL DEFAULT '', - replication_group_is_single_primary_mode TINYint NOT NULL DEFAULT 1, - replication_group_member_state VARCHAR(16) NOT NULL DEFAULT '', - replication_group_member_role VARCHAR(16) NOT NULL DEFAULT '', - replication_group_members text not null default '', - replication_group_primary_host varchar(128) NOT NULL DEFAULT '', - replication_group_primary_port smallint NOT NULL DEFAULT 0, - PRIMARY KEY (hostname,port) + PRIMARY KEY (alias) )`, ` CREATE INDEX last_checked_idx_database_instance ON database_instance(last_checked) @@ -103,48 +96,6 @@ CREATE INDEX last_checked_idx_database_instance ON database_instance(last_checke CREATE INDEX last_seen_idx_database_instance ON database_instance(last_seen) `, ` 
-DROP TABLE IF EXISTS database_instance_maintenance -`, - ` -CREATE TABLE database_instance_maintenance ( - database_instance_maintenance_id integer, - hostname varchar(128) NOT NULL, - port smallint NOT NULL, - maintenance_active tinyint(4) DEFAULT NULL, - begin_timestamp timestamp NULL DEFAULT NULL, - end_timestamp timestamp NULL DEFAULT NULL, - owner varchar(128) NOT NULL, - reason text NOT NULL, - processing_node_hostname varchar(128) not null default '', - processing_node_token varchar(128) not null default '', - explicitly_bounded TINYint not null default 0, - PRIMARY KEY (database_instance_maintenance_id) -)`, - ` -CREATE UNIQUE INDEX maintenance_uidx_database_instance_maintenance ON database_instance_maintenance (maintenance_active, hostname, port) - `, - ` -DROP TABLE IF EXISTS database_instance_long_running_queries -`, - ` -CREATE TABLE database_instance_long_running_queries ( - hostname varchar(128) NOT NULL, - port smallint NOT NULL, - process_id bigint(20) NOT NULL, - process_started_at timestamp not null default (''), - process_user varchar(16) NOT NULL, - process_host varchar(128) NOT NULL, - process_db varchar(128) NOT NULL, - process_command varchar(16) NOT NULL, - process_time_seconds int(11) NOT NULL, - process_state varchar(128) NOT NULL, - process_info varchar(1024) NOT NULL, - PRIMARY KEY (hostname,port,process_id) -)`, - ` -CREATE INDEX process_started_at_idx_database_instance_long_running_queries ON database_instance_long_running_queries (process_started_at) - `, - ` DROP TABLE IF EXISTS audit `, ` @@ -152,8 +103,7 @@ CREATE TABLE audit ( audit_id integer, audit_timestamp timestamp not null default (''), audit_type varchar(128) NOT NULL, - hostname varchar(128) NOT NULL DEFAULT '', - port smallint NOT NULL, + alias varchar(256) NOT NULL, message text NOT NULL, keyspace varchar(128) NOT NULL, shard varchar(128) NOT NULL, @@ -163,91 +113,7 @@ CREATE TABLE audit ( CREATE INDEX audit_timestamp_idx_audit ON audit (audit_timestamp) `, ` -CREATE 
INDEX host_port_idx_audit ON audit (hostname, port, audit_timestamp) - `, - ` -DROP TABLE IF EXISTS host_agent -`, - ` -CREATE TABLE host_agent ( - hostname varchar(128) NOT NULL, - port smallint NOT NULL, - token varchar(128) NOT NULL, - last_submitted timestamp not null default (''), - last_checked timestamp NULL DEFAULT NULL, - last_seen timestamp NULL DEFAULT NULL, - mysql_port smallint DEFAULT NULL, - count_mysql_snapshots smallint NOT NULL, - PRIMARY KEY (hostname) -)`, - ` -CREATE INDEX token_idx_host_agent ON host_agent (token) - `, - ` -CREATE INDEX last_submitted_idx_host_agent ON host_agent (last_submitted) - `, - ` -CREATE INDEX last_checked_idx_host_agent ON host_agent (last_checked) - `, - ` -CREATE INDEX last_seen_idx_host_agent ON host_agent (last_seen) - `, - ` -DROP TABLE IF EXISTS agent_seed -`, - ` -CREATE TABLE agent_seed ( - agent_seed_id integer, - target_hostname varchar(128) NOT NULL, - source_hostname varchar(128) NOT NULL, - start_timestamp timestamp not null default (''), - end_timestamp timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', - is_complete tinyint NOT NULL DEFAULT '0', - is_successful tinyint NOT NULL DEFAULT '0', - PRIMARY KEY (agent_seed_id) -)`, - ` -CREATE INDEX target_hostname_idx_agent_seed ON agent_seed (target_hostname,is_complete) - `, - ` -CREATE INDEX source_hostname_idx_agent_seed ON agent_seed (source_hostname,is_complete) - `, - ` -CREATE INDEX start_timestamp_idx_agent_seed ON agent_seed (start_timestamp) - `, - ` -CREATE INDEX is_complete_idx_agent_seed ON agent_seed (is_complete,start_timestamp) - `, - ` -CREATE INDEX is_successful_idx_agent_seed ON agent_seed (is_successful, start_timestamp) - `, - ` -DROP TABLE IF EXISTS agent_seed_state -`, - ` -CREATE TABLE agent_seed_state ( - agent_seed_state_id integer, - agent_seed_id int NOT NULL, - state_timestamp timestamp not null default (''), - state_action varchar(127) NOT NULL, - error_message varchar(255) NOT NULL, - PRIMARY KEY (agent_seed_state_id) -)`, - ` 
-CREATE INDEX agent_seed_idx_agent_seed_state ON agent_seed_state (agent_seed_id, state_timestamp) - `, - ` -DROP TABLE IF EXISTS hostname_resolve -`, - ` -CREATE TABLE hostname_resolve ( - hostname varchar(128) NOT NULL, - resolved_hostname varchar(128) NOT NULL, - resolved_timestamp timestamp not null default (''), - PRIMARY KEY (hostname) -)`, - ` -CREATE INDEX resolved_timestamp_idx_hostname_resolve ON hostname_resolve (resolved_timestamp) +CREATE INDEX alias_idx_audit ON audit (alias, audit_timestamp) `, ` DROP TABLE IF EXISTS active_node @@ -283,16 +149,14 @@ DROP TABLE IF EXISTS topology_recovery ` CREATE TABLE topology_recovery ( recovery_id integer, - hostname varchar(128) NOT NULL, - port smallint NOT NULL, + alias varchar(256) NOT NULL, in_active_period tinyint NOT NULL DEFAULT 0, start_active_period timestamp not null default (''), end_active_period_unixtime int, end_recovery timestamp NULL DEFAULT NULL, processing_node_hostname varchar(128) NOT NULL, processcing_node_token varchar(128) NOT NULL, - successor_hostname varchar(128) DEFAULT NULL, - successor_port smallint DEFAULT NULL, + successor_alias varchar(256) DEFAULT NULL, analysis varchar(128) not null default '', keyspace varchar(128) NOT NULL, shard varchar(128) NOT NULL, @@ -301,12 +165,9 @@ CREATE TABLE topology_recovery ( acknowledged TINYint NOT NULL DEFAULT 0, acknowledged_by varchar(128) not null default '', acknowledge_comment text not null default '', - participating_instances text not null default '', - lost_replicas text not null default '', all_errors text not null default '', acknowledged_at TIMESTAMP NULL, last_detection_id bigint not null default 0, - successor_alias varchar(128) DEFAULT NULL, uid varchar(128) not null default '', PRIMARY KEY (recovery_id) )`, @@ -317,20 +178,7 @@ CREATE INDEX in_active_start_period_idx_topology_recovery ON topology_recovery ( CREATE INDEX start_active_period_idx_topology_recovery ON topology_recovery (start_active_period) `, ` -CREATE UNIQUE INDEX 
hostname_port_active_period_uidx_topology_recovery ON topology_recovery (hostname, port, in_active_period, end_active_period_unixtime) - `, - ` -DROP TABLE IF EXISTS hostname_unresolve -`, - ` -CREATE TABLE hostname_unresolve ( - hostname varchar(128) NOT NULL, - unresolved_hostname varchar(128) NOT NULL, - last_registered timestamp not null default (''), - PRIMARY KEY (hostname) -)`, - ` -CREATE INDEX unresolved_hostname_idx_hostname_unresolve ON hostname_unresolve (unresolved_hostname) +CREATE UNIQUE INDEX alias_active_period_uidx_topology_recovery ON topology_recovery (alias, in_active_period, end_active_period_unixtime) `, ` DROP TABLE IF EXISTS database_instance_topology_history @@ -338,6 +186,7 @@ DROP TABLE IF EXISTS database_instance_topology_history ` CREATE TABLE database_instance_topology_history ( snapshot_unix_timestamp int NOT NULL, + alias varchar(256) NOT NULL, hostname varchar(128) NOT NULL, port smallint NOT NULL, source_host varchar(128) NOT NULL, @@ -345,7 +194,7 @@ CREATE TABLE database_instance_topology_history ( keyspace varchar(128) NOT NULL, shard varchar(128) NOT NULL, version varchar(128) not null default '', - PRIMARY KEY (snapshot_unix_timestamp, hostname, port) + PRIMARY KEY (snapshot_unix_timestamp, alias) )`, ` CREATE INDEX keyspace_shard_idx_database_instance_topology_history ON database_instance_topology_history (snapshot_unix_timestamp, keyspace, shard) @@ -355,38 +204,22 @@ DROP TABLE IF EXISTS candidate_database_instance `, ` CREATE TABLE candidate_database_instance ( - hostname varchar(128) NOT NULL, - port smallint NOT NULL, + alias varchar(256) NOT NULL, last_suggested timestamp not null default (''), priority TINYINT SIGNED NOT NULL DEFAULT 1, promotion_rule text check(promotion_rule in ('must', 'prefer', 'neutral', 'prefer_not', 'must_not')) NOT NULL DEFAULT 'neutral', - PRIMARY KEY (hostname, port) + PRIMARY KEY (alias) )`, ` CREATE INDEX last_suggested_idx_candidate_database_instance ON candidate_database_instance 
(last_suggested) `, ` -DROP TABLE IF EXISTS database_instance_downtime -`, - ` -CREATE TABLE database_instance_downtime ( - hostname varchar(128) NOT NULL, - port smallint NOT NULL, - downtime_active tinyint(4) DEFAULT NULL, - begin_timestamp timestamp default (''), - end_timestamp timestamp NULL DEFAULT NULL, - owner varchar(128) NOT NULL, - reason text NOT NULL, - PRIMARY KEY (hostname, port) -)`, - ` DROP TABLE IF EXISTS topology_failure_detection `, ` CREATE TABLE topology_failure_detection ( detection_id integer, - hostname varchar(128) NOT NULL, - port smallint NOT NULL, + alias varchar(256) NOT NULL, in_active_period tinyint NOT NULL DEFAULT '0', start_active_period timestamp not null default (''), end_active_period_unixtime int NOT NULL, @@ -403,100 +236,17 @@ CREATE TABLE topology_failure_detection ( CREATE INDEX in_active_start_period_idx_topology_failure_detection ON topology_failure_detection (in_active_period, start_active_period) `, ` -DROP TABLE IF EXISTS hostname_resolve_history -`, - ` -CREATE TABLE hostname_resolve_history ( - resolved_hostname varchar(128) NOT NULL, - hostname varchar(128) NOT NULL, - resolved_timestamp timestamp not null default (''), - PRIMARY KEY (resolved_hostname) -)`, - ` -CREATE INDEX hostname_idx_hostname_resolve_history ON hostname_resolve_history (hostname) - `, - ` -CREATE INDEX resolved_timestamp_idx_hostname_resolve_history ON hostname_resolve_history (resolved_timestamp) - `, - ` -DROP TABLE IF EXISTS hostname_unresolve_history -`, - ` -CREATE TABLE hostname_unresolve_history ( - unresolved_hostname varchar(128) NOT NULL, - hostname varchar(128) NOT NULL, - last_registered timestamp not null default (''), - PRIMARY KEY (unresolved_hostname) -)`, - ` -CREATE INDEX hostname_idx_hostname_unresolve_history ON hostname_unresolve_history (hostname) - `, - ` -CREATE INDEX last_registered_idx_hostname_unresolve_history ON hostname_unresolve_history (last_registered) - `, - ` -DROP TABLE IF EXISTS 
primary_position_equivalence -`, - ` -CREATE TABLE primary_position_equivalence ( - equivalence_id integer, - primary1_hostname varchar(128) NOT NULL, - primary1_port smallint NOT NULL, - primary1_binary_log_file varchar(128) NOT NULL, - primary1_binary_log_pos bigint NOT NULL, - primary2_hostname varchar(128) NOT NULL, - primary2_port smallint NOT NULL, - primary2_binary_log_file varchar(128) NOT NULL, - primary2_binary_log_pos bigint NOT NULL, - last_suggested timestamp not null default (''), - PRIMARY KEY (equivalence_id) -)`, - ` -CREATE UNIQUE INDEX equivalence_uidx_primary_position_equivalence ON primary_position_equivalence (primary1_hostname, primary1_port, primary1_binary_log_file, primary1_binary_log_pos, primary2_hostname, primary2_port) - `, - ` -CREATE INDEX primary2_idx_primary_position_equivalence ON primary_position_equivalence (primary2_hostname, primary2_port, primary2_binary_log_file, primary2_binary_log_pos) - `, - ` -CREATE INDEX last_suggested_idx_primary_position_equivalence ON primary_position_equivalence (last_suggested) - `, - ` -DROP TABLE IF EXISTS async_request -`, - ` -CREATE TABLE async_request ( - request_id integer, - command varchar(128) not null, - hostname varchar(128) NOT NULL, - port smallint NOT NULL, - destination_hostname varchar(128) NOT NULL, - destination_port smallint NOT NULL, - pattern text NOT NULL, - gtid_hint varchar(32) not null, - begin_timestamp timestamp NULL DEFAULT NULL, - end_timestamp timestamp NULL DEFAULT NULL, - story text NOT NULL, - PRIMARY KEY (request_id) -)`, - ` -CREATE INDEX begin_timestamp_idx_async_request ON async_request (begin_timestamp) - `, - ` -CREATE INDEX end_timestamp_idx_async_request ON async_request (end_timestamp) - `, - ` DROP TABLE IF EXISTS blocked_topology_recovery `, ` CREATE TABLE blocked_topology_recovery ( - hostname varchar(128) NOT NULL, - port smallint NOT NULL, + alias varchar(256) NOT NULL, keyspace varchar(128) NOT NULL, shard varchar(128) NOT NULL, analysis 
varchar(128) NOT NULL, last_blocked_timestamp timestamp not null default (''), blocking_recovery_id bigint, - PRIMARY KEY (hostname, port) + PRIMARY KEY (alias) )`, ` CREATE INDEX keyspace_shard_blocked_idx_blocked_topology_recovery ON blocked_topology_recovery (keyspace, shard, last_blocked_timestamp) @@ -506,11 +256,10 @@ DROP TABLE IF EXISTS database_instance_last_analysis `, ` CREATE TABLE database_instance_last_analysis ( - hostname varchar(128) NOT NULL, - port smallint NOT NULL, + alias varchar(256) NOT NULL, analysis_timestamp timestamp not null default (''), analysis varchar(128) NOT NULL, - PRIMARY KEY (hostname, port) + PRIMARY KEY (alias) )`, ` CREATE INDEX analysis_timestamp_idx_database_instance_last_analysis ON database_instance_last_analysis (analysis_timestamp) @@ -521,8 +270,7 @@ DROP TABLE IF EXISTS database_instance_analysis_changelog ` CREATE TABLE database_instance_analysis_changelog ( changelog_id integer, - hostname varchar(128) NOT NULL, - port smallint NOT NULL, + alias varchar(256) NOT NULL, analysis_timestamp timestamp not null default (''), analysis varchar(128) NOT NULL, PRIMARY KEY (changelog_id) @@ -551,76 +299,6 @@ CREATE INDEX first_seen_active_idx_node_health_history ON node_health_history (f CREATE UNIQUE INDEX hostname_token_idx_node_health_history ON node_health_history (hostname, token) `, ` -DROP TABLE IF EXISTS database_instance_coordinates_history -`, - ` -CREATE TABLE database_instance_coordinates_history ( - history_id integer, - hostname varchar(128) NOT NULL, - port smallint NOT NULL, - recorded_timestamp timestamp not null default (''), - binary_log_file varchar(128) NOT NULL, - binary_log_pos bigint NOT NULL, - relay_log_file varchar(128) NOT NULL, - relay_log_pos bigint NOT NULL, - last_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', - PRIMARY KEY (history_id) -)`, - ` -CREATE INDEX hostname_port_recorded_idx_database_instance_coordinates_history ON database_instance_coordinates_history (hostname, port, 
recorded_timestamp) - `, - ` -CREATE INDEX recorded_timestmp_idx_database_instance_coordinates_history ON database_instance_coordinates_history (recorded_timestamp) - `, - ` -DROP TABLE IF EXISTS database_instance_binlog_files_history -`, - ` -CREATE TABLE database_instance_binlog_files_history ( - history_id integer, - hostname varchar(128) NOT NULL, - port smallint NOT NULL, - binary_log_file varchar(128) NOT NULL, - binary_log_pos bigint NOT NULL, - first_seen timestamp not null default (''), - last_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', - PRIMARY KEY (history_id) -)`, - ` -CREATE UNIQUE INDEX hostname_port_file_idx_database_instance_binlog_files_history ON database_instance_binlog_files_history (hostname, port, binary_log_file) - `, - ` -CREATE INDEX last_seen_idx_database_instance_binlog_files_history ON database_instance_binlog_files_history (last_seen) - `, - ` -DROP TABLE IF EXISTS database_instance_recent_relaylog_history -`, - ` -CREATE TABLE database_instance_recent_relaylog_history ( - hostname varchar(128) NOT NULL, - port smallint NOT NULL, - current_relay_log_file varchar(128) NOT NULL, - current_relay_log_pos bigint NOT NULL, - current_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', - prev_relay_log_file varchar(128) NOT NULL, - prev_relay_log_pos bigint NOT NULL, - prev_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', - PRIMARY KEY (hostname, port) -)`, - ` -CREATE INDEX current_seen_idx_database_instance_recent_relaylog_history ON database_instance_recent_relaylog_history (current_seen) - `, - ` -DROP TABLE IF EXISTS vtorc_metadata -`, - ` -CREATE TABLE vtorc_metadata ( - anchor tinyint NOT NULL, - last_deployed_version varchar(128) NOT NULL, - last_deployed_timestamp timestamp NOT NULL, - PRIMARY KEY (anchor) -)`, - ` DROP TABLE IF EXISTS vtorc_db_deployments `, ` @@ -649,102 +327,15 @@ CREATE TABLE topology_recovery_steps ( PRIMARY KEY (recovery_step_id) )`, ` -DROP TABLE IF EXISTS raft_store -`, - ` -CREATE TABLE 
raft_store ( - store_id integer, - store_key varbinary(512) not null, - store_value blob not null, - PRIMARY KEY (store_id) -)`, - ` -CREATE INDEX store_key_idx_raft_store ON raft_store (store_key) - `, - ` -DROP TABLE IF EXISTS raft_log -`, - ` -CREATE TABLE raft_log ( - log_index integer, - term bigint not null, - log_type int not null, - data blob not null, - PRIMARY KEY (log_index) -)`, - ` -DROP TABLE IF EXISTS raft_snapshot -`, - ` -CREATE TABLE raft_snapshot ( - snapshot_id integer, - snapshot_name varchar(128) NOT NULL, - snapshot_meta varchar(4096) NOT NULL, - created_at timestamp not null default (''), - PRIMARY KEY (snapshot_id) -)`, - ` -CREATE UNIQUE INDEX snapshot_name_uidx_raft_snapshot ON raft_snapshot (snapshot_name) - `, - ` -DROP TABLE IF EXISTS database_instance_peer_analysis -`, - ` -CREATE TABLE database_instance_peer_analysis ( - peer varchar(128) NOT NULL, - hostname varchar(128) NOT NULL, - port smallint NOT NULL, - analysis_timestamp timestamp not null default (''), - analysis varchar(128) NOT NULL, - PRIMARY KEY (peer, hostname, port) -)`, - ` -DROP TABLE IF EXISTS database_instance_tls -`, - ` -CREATE TABLE database_instance_tls ( - hostname varchar(128) NOT NULL, - port smallint NOT NULL, - required tinyint NOT NULL DEFAULT 0, - PRIMARY KEY (hostname,port) -)`, - ` -DROP TABLE IF EXISTS hostname_ips -`, - ` -CREATE TABLE hostname_ips ( - hostname varchar(128) NOT NULL, - ipv4 varchar(128) NOT NULL, - ipv6 varchar(128) NOT NULL, - last_updated timestamp not null default (''), - PRIMARY KEY (hostname) -)`, - ` -DROP TABLE IF EXISTS database_instance_tags -`, - ` -CREATE TABLE database_instance_tags ( - hostname varchar(128) NOT NULL, - port smallint NOT NULL, - tag_name varchar(128) NOT NULL, - tag_value varchar(128) NOT NULL, - last_updated timestamp not null default (''), - PRIMARY KEY (hostname, port, tag_name) -)`, - ` -CREATE INDEX tag_name_idx_database_instance_tags ON database_instance_tags (tag_name) - `, - ` DROP TABLE IF EXISTS 
database_instance_stale_binlog_coordinates `, ` CREATE TABLE database_instance_stale_binlog_coordinates ( - hostname varchar(128) NOT NULL, - port smallint NOT NULL, + alias varchar(256) NOT NULL, binary_log_file varchar(128) NOT NULL, binary_log_pos bigint NOT NULL, first_seen timestamp not null default (''), - PRIMARY KEY (hostname, port) + PRIMARY KEY (alias) )`, ` CREATE INDEX first_seen_idx_database_instance_stale_binlog_coordinates ON database_instance_stale_binlog_coordinates (first_seen) @@ -763,8 +354,7 @@ CREATE TABLE vitess_tablet ( tablet_type smallint(5) NOT NULL, primary_timestamp timestamp NOT NULL, info varchar(512) NOT NULL, - UNIQUE (alias), - PRIMARY KEY (hostname, port) + PRIMARY KEY (alias) )`, ` CREATE INDEX cell_idx_vitess_tablet ON vitess_tablet (cell) @@ -786,15 +376,6 @@ CREATE TABLE vitess_keyspace ( CREATE INDEX source_host_port_idx_database_instance_database_instance on database_instance (source_host, source_port) `, ` -CREATE INDEX active_timestamp_idx_database_instance_maintenance on database_instance_maintenance (maintenance_active, begin_timestamp) - `, - ` -CREATE INDEX active_end_timestamp_idx_database_instance_maintenance on database_instance_maintenance (maintenance_active, end_timestamp) - `, - ` -CREATE INDEX last_registered_idx_hostname_unresolve on hostname_unresolve (last_registered) - `, - ` CREATE INDEX keyspace_shard_in_active_idx_topology_recovery on topology_recovery (keyspace, shard, in_active_period) `, ` @@ -807,7 +388,7 @@ CREATE INDEX acknowledged_idx_topology_recovery on topology_recovery (acknowledg CREATE INDEX last_blocked_idx_blocked_topology_recovery on blocked_topology_recovery (last_blocked_timestamp) `, ` -CREATE INDEX instance_timestamp_idx_database_instance_analysis_changelog on database_instance_analysis_changelog (hostname, port, analysis_timestamp) +CREATE INDEX instance_timestamp_idx_database_instance_analysis_changelog on database_instance_analysis_changelog (alias, analysis_timestamp) `, ` CREATE 
INDEX last_detection_idx_topology_recovery on topology_recovery (last_detection_id) @@ -822,9 +403,6 @@ CREATE INDEX uid_idx_topology_recovery ON topology_recovery(uid) CREATE INDEX recovery_uid_idx_topology_recovery_steps ON topology_recovery_steps(recovery_uid) `, ` -CREATE INDEX end_timestamp_idx_database_instance_downtime ON database_instance_downtime(end_timestamp) - `, - ` -CREATE UNIQUE INDEX host_port_active_recoverable_uidx_topology_failure_detection ON topology_failure_detection (hostname, port, in_active_period, end_active_period_unixtime, is_actionable) +CREATE UNIQUE INDEX alias_active_recoverable_uidx_topology_failure_detection ON topology_failure_detection (alias, in_active_period, end_active_period_unixtime, is_actionable) `, } diff --git a/go/vt/vtorc/discovery/aggregated.go b/go/vt/vtorc/discovery/aggregated.go index 67b3ff336b2..37d965fa51c 100644 --- a/go/vt/vtorc/discovery/aggregated.go +++ b/go/vt/vtorc/discovery/aggregated.go @@ -121,18 +121,18 @@ func aggregate(results []collection.Metric) AggregatedDiscoveryMetrics { // different names x := names[InstanceKeys] - x[v.InstanceKey.String()] = 1 // Value doesn't matter + x[v.TabletAlias] = 1 // Value doesn't matter names[InstanceKeys] = x if v.Err == nil { // ok names x := names[OkInstanceKeys] - x[v.InstanceKey.String()] = 1 // Value doesn't matter + x[v.TabletAlias] = 1 // Value doesn't matter names[OkInstanceKeys] = x } else { // failed names x := names[FailedInstanceKeys] - x[v.InstanceKey.String()] = 1 // Value doesn't matter + x[v.TabletAlias] = 1 // Value doesn't matter names[FailedInstanceKeys] = x } diff --git a/go/vt/vtorc/discovery/metric.go b/go/vt/vtorc/discovery/metric.go index 58afaa781ef..c322739502d 100644 --- a/go/vt/vtorc/discovery/metric.go +++ b/go/vt/vtorc/discovery/metric.go @@ -20,19 +20,17 @@ package discovery import ( "time" - - "vitess.io/vitess/go/vt/vtorc/inst" ) // Metric holds a set of information of instance discovery metrics type Metric struct { - Timestamp 
time.Time // time the collection was taken - InstanceKey inst.InstanceKey // instance being monitored - BackendLatency time.Duration // time taken talking to the backend - InstanceLatency time.Duration // time taken talking to the instance - TotalLatency time.Duration // total time taken doing the discovery - Err error // error (if applicable) doing the discovery process - InstancePollSecondsDurationCount uint64 // total numbers of times discoverInstance exceeded InstancePollSeconds + Timestamp time.Time // time the collection was taken + TabletAlias string // instance being monitored + BackendLatency time.Duration // time taken talking to the backend + InstanceLatency time.Duration // time taken talking to the instance + TotalLatency time.Duration // total time taken doing the discovery + Err error // error (if applicable) doing the discovery process + InstancePollSecondsDurationCount uint64 // total numbers of times discoverInstance exceeded InstancePollSeconds } // When did the metric happen diff --git a/go/vt/vtorc/discovery/metric_json.go b/go/vt/vtorc/discovery/metric_json.go deleted file mode 100644 index eb204f28043..00000000000 --- a/go/vt/vtorc/discovery/metric_json.go +++ /dev/null @@ -1,74 +0,0 @@ -/* - Copyright 2017 Simon J Mudd - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package discovery - -// Collect discovery metrics and manage their storage and retrieval for monitoring purposes. 
- -import ( - "errors" - "fmt" - "time" - - "vitess.io/vitess/go/vt/vtorc/collection" -) - -// formattedFloat is to force the JSON output to show 3 decimal places -type formattedFloat float64 - -func (m formattedFloat) String() string { - return fmt.Sprintf("%.3f", m) -} - -// MetricJSON holds a structure which represents some discovery latency information -type MetricJSON struct { - Timestamp time.Time - Hostname string - Port int - BackendLatencySeconds formattedFloat - InstanceLatencySeconds formattedFloat - TotalLatencySeconds formattedFloat - Err error -} - -// JSONSince returns an API response of discovery metric collection information -// in a printable JSON format. -func JSONSince(c *collection.Collection, t time.Time) ([](MetricJSON), error) { - if c == nil { - return nil, errors.New("MetricCollection.JSONSince: c == nil") - } - raw, err := c.Since(t) - if err != nil { - return nil, err - } - - // build up JSON response for each Metric we received - var s []MetricJSON - for i := range raw { - m := raw[i].(*Metric) // convert back to a real Metric rather than collection.Metric interface - mj := MetricJSON{ - Timestamp: m.Timestamp, - Hostname: m.InstanceKey.Hostname, - Port: m.InstanceKey.Port, - BackendLatencySeconds: formattedFloat(m.BackendLatency.Seconds()), - InstanceLatencySeconds: formattedFloat(m.InstanceLatency.Seconds()), - TotalLatencySeconds: formattedFloat(m.TotalLatency.Seconds()), - Err: m.Err, - } - s = append(s, mj) - } - return s, nil -} diff --git a/go/vt/vtorc/discovery/queue.go b/go/vt/vtorc/discovery/queue.go index 50d5c276e4e..1a0fd761300 100644 --- a/go/vt/vtorc/discovery/queue.go +++ b/go/vt/vtorc/discovery/queue.go @@ -31,7 +31,6 @@ import ( "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/vtorc/config" - "vitess.io/vitess/go/vt/vtorc/inst" ) // QueueMetric contains the queue's active and queued sizes @@ -46,9 +45,9 @@ type Queue struct { name string done chan struct{} - queue chan inst.InstanceKey - queuedKeys 
map[inst.InstanceKey]time.Time - consumedKeys map[inst.InstanceKey]time.Time + queue chan string + queuedKeys map[string]time.Time + consumedKeys map[string]time.Time metrics []QueueMetric } @@ -62,13 +61,6 @@ func init() { discoveryQueue = make(map[string](*Queue)) } -// StopMonitoring stops monitoring all the queues -func StopMonitoring() { - for _, q := range discoveryQueue { - q.stopMonitoring() - } -} - // CreateOrReturnQueue allows for creation of a new discovery queue or // returning a pointer to an existing one given the name. func CreateOrReturnQueue(name string) *Queue { @@ -80,9 +72,9 @@ func CreateOrReturnQueue(name string) *Queue { q := &Queue{ name: name, - queuedKeys: make(map[inst.InstanceKey]time.Time), - consumedKeys: make(map[inst.InstanceKey]time.Time), - queue: make(chan inst.InstanceKey, config.DiscoveryQueueCapacity), + queuedKeys: make(map[string]time.Time), + consumedKeys: make(map[string]time.Time), + queue: make(chan string, config.DiscoveryQueueCapacity), } go q.startMonitoring() @@ -134,7 +126,7 @@ func (q *Queue) QueueLen() int { // Push enqueues a key if it is not on a queue and is not being // processed; silently returns otherwise. -func (q *Queue) Push(key inst.InstanceKey) { +func (q *Queue) Push(key string) { q.Lock() defer q.Unlock() @@ -154,7 +146,7 @@ func (q *Queue) Push(key inst.InstanceKey) { // Consume fetches a key to process; blocks if queue is empty. // Release must be called once after Consume. -func (q *Queue) Consume() inst.InstanceKey { +func (q *Queue) Consume() string { q.Lock() queue := q.queue q.Unlock() @@ -179,7 +171,7 @@ func (q *Queue) Consume() inst.InstanceKey { // Release removes a key from a list of being processed keys // which allows that key to be pushed into the queue again. 
-func (q *Queue) Release(key inst.InstanceKey) { +func (q *Queue) Release(key string) { q.Lock() defer q.Unlock() diff --git a/go/vt/vtorc/inst/analysis.go b/go/vt/vtorc/inst/analysis.go index 07291578e0f..777a02fb3c5 100644 --- a/go/vt/vtorc/inst/analysis.go +++ b/go/vt/vtorc/inst/analysis.go @@ -18,7 +18,6 @@ package inst import ( "encoding/json" - "fmt" "strings" "time" @@ -72,38 +71,14 @@ const ( NotEnoughValidSemiSyncReplicasStructureWarning StructureAnalysisCode = "NotEnoughValidSemiSyncReplicasStructureWarning" ) -type InstanceAnalysis struct { - key *InstanceKey - analysis AnalysisCode -} - -func NewInstanceAnalysis(instanceKey *InstanceKey, analysis AnalysisCode) *InstanceAnalysis { - return &InstanceAnalysis{ - key: instanceKey, - analysis: analysis, - } -} - -func (instanceAnalysis *InstanceAnalysis) String() string { - return fmt.Sprintf("%s/%s", instanceAnalysis.key.StringCode(), string(instanceAnalysis.analysis)) -} - // PeerAnalysisMap indicates the number of peers agreeing on an analysis. 
// Key of this map is a InstanceAnalysis.String() type PeerAnalysisMap map[string]int type ReplicationAnalysisHints struct { - IncludeDowntimed bool - IncludeNoProblem bool - AuditAnalysis bool + AuditAnalysis bool } -const ( - ForcePrimaryFailoverCommandHint string = "force-primary-failover" - ForcePrimaryTakeoverCommandHint string = "force-primary-takeover" - GracefulPrimaryTakeoverCommandHint string = "graceful-primary-takeover" -) - type AnalysisInstanceType string const ( @@ -114,9 +89,10 @@ const ( // ReplicationAnalysis notes analysis on replication chain status, per instance type ReplicationAnalysis struct { - AnalyzedInstanceKey InstanceKey - AnalyzedInstanceAlias *topodatapb.TabletAlias - AnalyzedInstancePrimaryKey InstanceKey + AnalyzedInstanceHostname string + AnalyzedInstancePort int + AnalyzedInstanceAlias string + AnalyzedInstancePrimaryAlias string TabletType topodatapb.TabletType PrimaryTimeStamp time.Time ClusterDetails ClusterInfo @@ -135,17 +111,12 @@ type ReplicationAnalysis struct { CountValidReplicas uint CountValidReplicatingReplicas uint CountReplicasFailingToConnectToPrimary uint - CountDowntimedReplicas uint ReplicationDepth uint IsFailingToConnectToPrimary bool ReplicationStopped bool Analysis AnalysisCode Description string StructureAnalysis []StructureAnalysisCode - IsDowntimed bool - IsReplicasDowntimed bool // as good as downtimed because all replicas are downtimed AND analysis is all about the replicas (e.e. 
AllPrimaryReplicasNotReplicating) - DowntimeEndTimestamp string - DowntimeRemainingSeconds int IsBinlogServer bool OracleGTIDImmediateTopology bool MariaDBGTIDImmediateTopology bool @@ -166,24 +137,14 @@ type ReplicationAnalysis struct { IsActionableRecovery bool ProcessingNodeHostname string ProcessingNodeToken string - CountAdditionalAgreeingNodes int StartActivePeriod string - SkippableDueToDowntime bool GTIDMode string MinReplicaGTIDMode string MaxReplicaGTIDMode string MaxReplicaGTIDErrant string - CommandHint string IsReadOnly bool } -type AnalysisMap map[string](*ReplicationAnalysis) - -type ReplicationAnalysisChangelog struct { - AnalyzedInstanceKey InstanceKey - Changelog []string -} - func (replicationAnalysis *ReplicationAnalysis) MarshalJSON() ([]byte, error) { i := struct { ReplicationAnalysis diff --git a/go/vt/vtorc/inst/analysis_dao.go b/go/vt/vtorc/inst/analysis_dao.go index c2252287b1c..0a88bc9095d 100644 --- a/go/vt/vtorc/inst/analysis_dao.go +++ b/go/vt/vtorc/inst/analysis_dao.go @@ -22,6 +22,7 @@ import ( "vitess.io/vitess/go/vt/external/golib/sqlutils" "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/topo/topoproto" "google.golang.org/protobuf/encoding/prototext" @@ -37,13 +38,11 @@ import ( "github.com/rcrowley/go-metrics" ) -var analysisChangeWriteAttemptCounter = metrics.NewCounter() var analysisChangeWriteCounter = metrics.NewCounter() var recentInstantAnalysis *cache.Cache func init() { - _ = metrics.Register("analysis.change.write.attempt", analysisChangeWriteAttemptCounter) _ = metrics.Register("analysis.change.write", analysisChangeWriteCounter) go initializeAnalysisDaoPostConfiguration() @@ -57,7 +56,7 @@ func initializeAnalysisDaoPostConfiguration() { type clusterAnalysis struct { hasClusterwideAction bool - primaryKey *InstanceKey + primaryAlias string durability reparentutil.Durabler } @@ -79,12 +78,10 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna vitess_keyspace.keyspace_type AS 
keyspace_type, vitess_keyspace.durability_policy AS durability_policy, primary_instance.read_only AS read_only, - MIN(primary_instance.hostname) IS NULL AS is_invalid, + MIN(primary_instance.alias) IS NULL AS is_invalid, MIN(primary_instance.data_center) AS data_center, MIN(primary_instance.region) AS region, MIN(primary_instance.physical_environment) AS physical_environment, - MIN(primary_instance.source_host) AS source_host, - MIN(primary_instance.source_port) AS source_port, MIN(primary_instance.binary_log_file) AS binary_log_file, MIN(primary_instance.binary_log_pos) AS binary_log_pos, MIN(primary_tablet.info) AS primary_tablet_info, @@ -109,10 +106,6 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna OR primary_instance.source_port = 0 OR substr(primary_instance.source_host, 1, 2) = '//' ) - AND ( - primary_instance.replication_group_name = '' - OR primary_instance.replication_group_member_role = 'PRIMARY' - ) ) AS is_primary, MIN(primary_instance.is_co_primary) AS is_co_primary, MIN(primary_instance.gtid_mode) AS gtid_mode, @@ -150,19 +143,6 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna primary_instance.replica_sql_running = 0 OR primary_instance.replica_io_running = 0 ) AS replication_stopped, - MIN( - primary_downtime.downtime_active is not null - and ifnull(primary_downtime.end_timestamp, now()) > now() - ) AS is_downtimed, - MIN( - IFNULL(primary_downtime.end_timestamp, '') - ) AS downtime_end_timestamp, - MIN( - IFNULL( - unix_timestamp() - unix_timestamp(primary_downtime.end_timestamp), - 0 - ) - ) AS downtime_remaining_seconds, MIN( primary_instance.binlog_server ) AS is_binlog_server, @@ -267,18 +247,10 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna IFNULL(MAX(replica_instance.gtid_mode), '') AS max_replica_gtid_mode, IFNULL( MAX( - case when replica_downtime.downtime_active is not null - and ifnull(replica_downtime.end_timestamp, now()) > now() 
then '' else replica_instance.gtid_errant end + replica_instance.gtid_errant ), '' ) AS max_replica_gtid_errant, - IFNULL( - SUM( - replica_downtime.downtime_active is not null - and ifnull(replica_downtime.end_timestamp, now()) > now() - ), - 0 - ) AS count_downtimed_replicas, COUNT( DISTINCT case when replica_instance.log_bin AND replica_instance.log_replica_updates then replica_instance.major_version else NULL end @@ -289,49 +261,24 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna vitess_tablet.keyspace = vitess_keyspace.keyspace ) LEFT JOIN database_instance primary_instance ON ( - vitess_tablet.hostname = primary_instance.hostname - AND vitess_tablet.port = primary_instance.port + vitess_tablet.alias = primary_instance.alias ) LEFT JOIN vitess_tablet primary_tablet ON ( primary_tablet.hostname = primary_instance.source_host AND primary_tablet.port = primary_instance.source_port ) - LEFT JOIN hostname_resolve ON ( - primary_instance.hostname = hostname_resolve.hostname - ) LEFT JOIN database_instance replica_instance ON ( - COALESCE( - hostname_resolve.resolved_hostname, - primary_instance.hostname - ) = replica_instance.source_host + primary_instance.hostname = replica_instance.source_host AND primary_instance.port = replica_instance.source_port ) - LEFT JOIN database_instance_maintenance ON ( - primary_instance.hostname = database_instance_maintenance.hostname - AND primary_instance.port = database_instance_maintenance.port - AND database_instance_maintenance.maintenance_active = 1 - ) LEFT JOIN database_instance_stale_binlog_coordinates ON ( - primary_instance.hostname = database_instance_stale_binlog_coordinates.hostname - AND primary_instance.port = database_instance_stale_binlog_coordinates.port - ) - LEFT JOIN database_instance_downtime as primary_downtime ON ( - primary_instance.hostname = primary_downtime.hostname - AND primary_instance.port = primary_downtime.port - AND primary_downtime.downtime_active = 1 - ) - LEFT 
JOIN database_instance_downtime as replica_downtime ON ( - replica_instance.hostname = replica_downtime.hostname - AND replica_instance.port = replica_downtime.port - AND replica_downtime.downtime_active = 1 + vitess_tablet.alias = database_instance_stale_binlog_coordinates.alias ) WHERE - database_instance_maintenance.database_instance_maintenance_id IS NULL - AND ? IN ('', vitess_keyspace.keyspace) + ? IN ('', vitess_keyspace.keyspace) AND ? IN ('', vitess_tablet.shard) GROUP BY - vitess_tablet.hostname, - vitess_tablet.port + vitess_tablet.alias ORDER BY vitess_tablet.tablet_type ASC, vitess_tablet.primary_timestamp DESC @@ -373,9 +320,10 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna a.IsPrimary = m.GetBool("is_primary") countCoPrimaryReplicas := m.GetUint("count_co_primary_replicas") a.IsCoPrimary = m.GetBool("is_co_primary") || (countCoPrimaryReplicas > 0) - a.AnalyzedInstanceKey = InstanceKey{Hostname: m.GetString("hostname"), Port: m.GetInt("port")} - a.AnalyzedInstanceAlias = tablet.Alias - a.AnalyzedInstancePrimaryKey = InstanceKey{Hostname: m.GetString("source_host"), Port: m.GetInt("source_port")} + a.AnalyzedInstanceHostname = m.GetString("hostname") + a.AnalyzedInstancePort = m.GetInt("port") + a.AnalyzedInstanceAlias = topoproto.TabletAliasString(tablet.Alias) + a.AnalyzedInstancePrimaryAlias = topoproto.TabletAliasString(primaryTablet.Alias) a.AnalyzedInstanceDataCenter = m.GetString("data_center") a.AnalyzedInstanceRegion = m.GetString("region") a.AnalyzedInstancePhysicalEnvironment = m.GetString("physical_environment") @@ -394,13 +342,9 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna a.CountValidReplicas = m.GetUint("count_valid_replicas") a.CountValidReplicatingReplicas = m.GetUint("count_valid_replicating_replicas") a.CountReplicasFailingToConnectToPrimary = m.GetUint("count_replicas_failing_to_connect_to_primary") - a.CountDowntimedReplicas = 
m.GetUint("count_downtimed_replicas") a.ReplicationDepth = m.GetUint("replication_depth") a.IsFailingToConnectToPrimary = m.GetBool("is_failing_to_connect_to_primary") a.ReplicationStopped = m.GetBool("replication_stopped") - a.IsDowntimed = m.GetBool("is_downtimed") - a.DowntimeEndTimestamp = m.GetString("downtime_end_timestamp") - a.DowntimeRemainingSeconds = m.GetInt("downtime_remaining_seconds") a.IsBinlogServer = m.GetBool("is_binlog_server") a.ClusterDetails.ReadRecoveryInfo() @@ -446,7 +390,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna clusters[keyspaceShard] = &clusterAnalysis{} if a.TabletType == topodatapb.TabletType_PRIMARY { a.IsClusterPrimary = true - clusters[keyspaceShard].primaryKey = &a.AnalyzedInstanceKey + clusters[keyspaceShard].primaryAlias = a.AnalyzedInstanceAlias } durabilityPolicy := m.GetString("durability_policy") if durabilityPolicy == "" { @@ -503,15 +447,15 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna a.Analysis = PrimaryIsReadOnly a.Description = "Primary is read-only" // - } else if a.IsClusterPrimary && SemiSyncAckers(ca.durability, tablet) != 0 && !a.SemiSyncPrimaryEnabled { + } else if a.IsClusterPrimary && reparentutil.SemiSyncAckers(ca.durability, tablet) != 0 && !a.SemiSyncPrimaryEnabled { a.Analysis = PrimarySemiSyncMustBeSet a.Description = "Primary semi-sync must be set" // - } else if a.IsClusterPrimary && SemiSyncAckers(ca.durability, tablet) == 0 && a.SemiSyncPrimaryEnabled { + } else if a.IsClusterPrimary && reparentutil.SemiSyncAckers(ca.durability, tablet) == 0 && a.SemiSyncPrimaryEnabled { a.Analysis = PrimarySemiSyncMustNotBeSet a.Description = "Primary semi-sync must not be set" // - } else if topo.IsReplicaType(a.TabletType) && ca.primaryKey == nil { + } else if topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" { a.Analysis = ClusterHasNoPrimary a.Description = "Cluster has no primary" ca.hasClusterwideAction = true @@ -523,7 
+467,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna a.Analysis = NotConnectedToPrimary a.Description = "Not connected to the primary" // - } else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && ca.primaryKey != nil && a.AnalyzedInstancePrimaryKey != *ca.primaryKey { + } else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && ca.primaryAlias != "" && a.AnalyzedInstancePrimaryAlias != ca.primaryAlias { a.Analysis = ConnectedToWrongPrimary a.Description = "Connected to wrong primary" // @@ -531,11 +475,11 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna a.Analysis = ReplicationStopped a.Description = "Replication is stopped" // - } else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && !a.SemiSyncReplicaEnabled { + } else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && reparentutil.IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && !a.SemiSyncReplicaEnabled { a.Analysis = ReplicaSemiSyncMustBeSet a.Description = "Replica semi-sync must be set" // - } else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && !IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && a.SemiSyncReplicaEnabled { + } else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && !reparentutil.IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && a.SemiSyncReplicaEnabled { a.Analysis = ReplicaSemiSyncMustNotBeSet a.Description = "Replica semi-sync must not be set" // @@ -589,22 +533,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna // } appendAnalysis := func(analysis *ReplicationAnalysis) { - if a.Analysis == NoProblem && len(a.StructureAnalysis) == 0 && !hints.IncludeNoProblem { - return - } - if a.IsDowntimed { - a.SkippableDueToDowntime = true - } - if a.CountReplicas == a.CountDowntimedReplicas { - switch a.Analysis { - case AllPrimaryReplicasNotReplicating, - 
AllPrimaryReplicasNotReplicatingOrDead, - PrimarySingleReplicaDead: - a.IsReplicasDowntimed = true - a.SkippableDueToDowntime = true - } - } - if a.SkippableDueToDowntime && !hints.IncludeDowntimed { + if a.Analysis == NoProblem && len(a.StructureAnalysis) == 0 { return } result = append(result, a) @@ -655,7 +584,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna if a.CountReplicas > 0 && hints.AuditAnalysis { // Interesting enough for analysis go func() { - _ = auditInstanceAnalysisInChangelog(&a.AnalyzedInstanceKey, a.Analysis) + _ = auditInstanceAnalysisInChangelog(a.AnalyzedInstanceAlias, a.Analysis) }() } return nil @@ -671,20 +600,17 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna // auditInstanceAnalysisInChangelog will write down an instance's analysis in the database_instance_analysis_changelog table. // To not repeat recurring analysis code, the database_instance_last_analysis table is used, so that only changes to // analysis codes are written. -func auditInstanceAnalysisInChangelog(instanceKey *InstanceKey, analysisCode AnalysisCode) error { - if lastWrittenAnalysis, found := recentInstantAnalysis.Get(instanceKey.DisplayString()); found { +func auditInstanceAnalysisInChangelog(tabletAlias string, analysisCode AnalysisCode) error { + if lastWrittenAnalysis, found := recentInstantAnalysis.Get(tabletAlias); found { if lastWrittenAnalysis == analysisCode { // Surely nothing new. // And let's expand the timeout - recentInstantAnalysis.Set(instanceKey.DisplayString(), analysisCode, cache.DefaultExpiration) + recentInstantAnalysis.Set(tabletAlias, analysisCode, cache.DefaultExpiration) return nil } } - // Passed the cache; but does database agree that there's a change? 
Here's a persistent cache; this comes here - // to verify no two vtorc services are doing this without coordinating (namely, one dies, the other taking its place - // and has no familiarity of the former's cache) - analysisChangeWriteAttemptCounter.Inc(1) + // Find if the lastAnalysisHasChanged or not while updating the row if it has. lastAnalysisChanged := false { sqlResult, err := db.ExecVTOrc(` @@ -692,11 +618,10 @@ func auditInstanceAnalysisInChangelog(instanceKey *InstanceKey, analysisCode Ana analysis = ?, analysis_timestamp = now() where - hostname = ? - and port = ? + alias = ? and analysis != ? `, - string(analysisCode), instanceKey.Hostname, instanceKey.Port, string(analysisCode), + string(analysisCode), tabletAlias, string(analysisCode), ) if err != nil { log.Error(err) @@ -707,36 +632,48 @@ func auditInstanceAnalysisInChangelog(instanceKey *InstanceKey, analysisCode Ana log.Error(err) return err } - lastAnalysisChanged = (rows > 0) + lastAnalysisChanged = rows > 0 } + + // If the last analysis has not changed, then there is a chance that this is the first insertion. + // We need to find that out too when we insert into the database. + firstInsertion := false if !lastAnalysisChanged { - _, err := db.ExecVTOrc(` + // The insert only returns more than 1 row changed if this is the first insertion. + sqlResult, err := db.ExecVTOrc(` insert ignore into database_instance_last_analysis ( - hostname, port, analysis_timestamp, analysis + alias, analysis_timestamp, analysis ) values ( - ?, ?, now(), ? + ?, now(), ? 
) `, - instanceKey.Hostname, instanceKey.Port, string(analysisCode), + tabletAlias, string(analysisCode), ) if err != nil { log.Error(err) return err } + rows, err := sqlResult.RowsAffected() + if err != nil { + log.Error(err) + return err + } + firstInsertion = rows > 0 } - recentInstantAnalysis.Set(instanceKey.DisplayString(), analysisCode, cache.DefaultExpiration) - if !lastAnalysisChanged { + recentInstantAnalysis.Set(tabletAlias, analysisCode, cache.DefaultExpiration) + // If the analysis has changed or if it is the first insertion, we need to make sure we write this change to the database. + if !lastAnalysisChanged && !firstInsertion { return nil } _, err := db.ExecVTOrc(` insert into database_instance_analysis_changelog ( - hostname, port, analysis_timestamp, analysis + alias, analysis_timestamp, analysis ) values ( - ?, ?, now(), ? + ?, now(), ? ) `, - instanceKey.Hostname, instanceKey.Port, string(analysisCode), + tabletAlias, string(analysisCode), ) if err == nil { analysisChangeWriteCounter.Inc(1) diff --git a/go/vt/vtorc/inst/analysis_dao_test.go b/go/vt/vtorc/inst/analysis_dao_test.go index 480986e34ba..8bce5049ca8 100644 --- a/go/vt/vtorc/inst/analysis_dao_test.go +++ b/go/vt/vtorc/inst/analysis_dao_test.go @@ -18,7 +18,10 @@ package inst import ( "testing" + "time" + "github.com/patrickmn/go-cache" + "github.com/rcrowley/go-metrics" "github.com/stretchr/testify/require" "vitess.io/vitess/go/vt/external/golib/sqlutils" @@ -28,6 +31,22 @@ import ( "vitess.io/vitess/go/vt/vtorc/test" ) +var ( + // The initialSQL is a set of insert commands copied from a dump of an actual running VTOrc instances. The relevant insert commands are here. + // This is a dump taken from a test running 4 tablets, zone1-101 is the primary, zone1-100 is a replica, zone1-112 is a rdonly and zone2-200 is a cross-cell replica. 
+ initialSQL = []string{ + `INSERT INTO database_instance VALUES('zone1-0000000112','localhost',6747,'2022-12-28 07:26:04','2022-12-28 07:26:04',213696377,'8.0.31','ROW',1,1,'vt-0000000112-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000112-relay-bin.000002',15815,0,1,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a5138-8680-11ed-9240-92a06c3be3c2','2022-12-28 07:26:04','',1,0,0,'Homebrew','8.0','FULL',10816929,0,0,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a5138-8680-11ed-9240-92a06c3be3c2',1,1,'',1000000000000000000,1,0,0,0);`, + `INSERT INTO database_instance VALUES('zone1-0000000100','localhost',6711,'2022-12-28 07:26:04','2022-12-28 07:26:04',1094500338,'8.0.31','ROW',1,1,'vt-0000000100-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000100-relay-bin.000002',15815,0,1,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a5138-8680-11ed-acf8-d6b0ef9f4eaa','2022-12-28 07:26:04','',1,0,0,'Homebrew','8.0','FULL',10103920,0,1,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a5138-8680-11ed-acf8-d6b0ef9f4eaa',1,1,'',1000000000000000000,1,0,1,0);`, + `INSERT INTO database_instance VALUES('zone1-0000000101','localhost',6714,'2022-12-28 07:26:04','2022-12-28 07:26:04',390954723,'8.0.31','ROW',1,1,'vt-0000000101-bin.000001',15583,'',0,0,0,'',0,'',0,NULL,NULL,0,'','',0,0,'',0,0,0,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a4cc4-8680-11ed-a104-47706090afbd','2022-12-28 07:26:04','',0,0,0,'Homebrew','8.0','FULL',11366095,1,1,'ON',1,'','','729a4cc4-8680-11ed-a104-47706090afbd',-1,-1,'',1000000000000000000,1,1,0,2);`, + `INSERT INTO database_instance VALUES('zone2-0000000200','localhost',6756,'2022-12-28 07:26:05','2022-12-28 
07:26:05',444286571,'8.0.31','ROW',1,1,'vt-0000000200-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000200-relay-bin.000002',15815,0,1,0,'zone2','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a497c-8680-11ed-8ad4-3f51d747db75','2022-12-28 07:26:05','',1,0,0,'Homebrew','8.0','FULL',10443112,0,1,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a497c-8680-11ed-8ad4-3f51d747db75',1,1,'',1000000000000000000,1,0,1,0);`, + `INSERT INTO vitess_tablet VALUES('zone1-0000000100','localhost',6711,'ks','0','zone1',2,'0001-01-01 00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3130307d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363731307d20706f72745f6d61703a7b6b65793a227674222076616c75653a363730397d206b657973706163653a226b73222073686172643a22302220747970653a5245504c494341206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363731312064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, + `INSERT INTO vitess_tablet VALUES('zone1-0000000101','localhost',6714,'ks','0','zone1',1,'2022-12-28 07:23:25.129898+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3130317d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363731337d20706f72745f6d61703a7b6b65793a227674222076616c75653a363731327d206b657973706163653a226b73222073686172643a22302220747970653a5052494d415259206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a36373134207072696d6172795f7465726d5f73746172745f74696d653a7b7365636f6e64733a31363732323132323035206e616e6f7365636f6e64733a3132393839383030307d2064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, + `INSERT INTO vitess_tablet 
VALUES('zone1-0000000112','localhost',6747,'ks','0','zone1',3,'0001-01-01 00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3131327d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363734367d20706f72745f6d61703a7b6b65793a227674222076616c75653a363734357d206b657973706163653a226b73222073686172643a22302220747970653a52444f4e4c59206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363734372064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, + `INSERT INTO vitess_tablet VALUES('zone2-0000000200','localhost',6756,'ks','0','zone2',2,'0001-01-01 00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653222207569643a3230307d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363735357d20706f72745f6d61703a7b6b65793a227674222076616c75653a363735347d206b657973706163653a226b73222073686172643a22302220747970653a5245504c494341206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363735362064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, + `INSERT INTO vitess_keyspace VALUES('ks',0,'semi_sync');`, + } +) + // TestGetReplicationAnalysisDecision tests the code of GetReplicationAnalysis decision-making. It doesn't check the SQL query // run by it. It only checks the analysis part after the rows have been read. This tests fakes the db and explicitly returns the // rows that are specified in the test. 
@@ -297,10 +316,11 @@ func TestGetReplicationAnalysisDecision(t *testing.T) { MysqlPort: 6709, }, DurabilityPolicy: "none", - SourceHost: "localhost", - SourcePort: 6708, - LastCheckValid: 1, - ReadOnly: 0, + PrimaryTabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + }, + LastCheckValid: 1, + ReadOnly: 0, }}, keyspaceWanted: "ks", shardWanted: "0", @@ -336,10 +356,11 @@ func TestGetReplicationAnalysisDecision(t *testing.T) { MysqlPort: 6709, }, DurabilityPolicy: "none", - SourceHost: "localhost", - SourcePort: 6706, - LastCheckValid: 1, - ReadOnly: 1, + PrimaryTabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 102}, + }, + LastCheckValid: 1, + ReadOnly: 1, }}, keyspaceWanted: "ks", shardWanted: "0", @@ -374,9 +395,10 @@ func TestGetReplicationAnalysisDecision(t *testing.T) { MysqlHostname: "localhost", MysqlPort: 6709, }, - DurabilityPolicy: "none", - SourceHost: "localhost", - SourcePort: 6708, + DurabilityPolicy: "none", + PrimaryTabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + }, LastCheckValid: 1, ReadOnly: 1, ReplicationStopped: 1, @@ -417,17 +439,9 @@ func TestGetReplicationAnalysisDecision(t *testing.T) { MysqlPort: 6709, }, PrimaryTabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, - Hostname: "localhost", - Keyspace: "ks", - Shard: "0", - Type: topodatapb.TabletType_PRIMARY, - MysqlHostname: "localhost", - MysqlPort: 6708, + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, }, DurabilityPolicy: "semi_sync", - SourceHost: "localhost", - SourcePort: 6708, LastCheckValid: 1, ReadOnly: 1, SemiSyncReplicaEnabled: 0, @@ -466,17 +480,9 @@ func TestGetReplicationAnalysisDecision(t *testing.T) { MysqlPort: 6709, }, PrimaryTabletInfo: &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, - Hostname: "localhost", - Keyspace: "ks", - Shard: "0", - Type: topodatapb.TabletType_PRIMARY, - 
MysqlHostname: "localhost", - MysqlPort: 6708, + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, }, DurabilityPolicy: "none", - SourceHost: "localhost", - SourcePort: 6708, LastCheckValid: 1, ReadOnly: 1, SemiSyncReplicaEnabled: 1, @@ -600,20 +606,6 @@ func TestGetReplicationAnalysisDecision(t *testing.T) { // This test is somewhere between a unit test, and an end-to-end test. It is specifically useful for testing situations which are hard to come by in end-to-end test, but require // real-world data to test specifically. func TestGetReplicationAnalysis(t *testing.T) { - // The initialSQL is a set of insert commands copied from a dump of an actual running VTOrc instances. The relevant insert commands are here. - // This is a dump taken from a test running 4 tablets, zone1-101 is the primary, zone1-100 is a replica, zone1-112 is a rdonly and zone2-200 is a cross-cell replica. - initialSQL := []string{ - `INSERT INTO database_instance VALUES('localhost',6747,'2022-12-28 07:26:04','2022-12-28 07:26:04',213696377,'8.0.31','ROW',1,1,'vt-0000000112-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000112-relay-bin.000002',15815,0,1,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a5138-8680-11ed-9240-92a06c3be3c2','2022-12-28 07:26:04','',1,0,0,'zone1-0000000112','Homebrew','8.0','FULL',10816929,0,0,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a5138-8680-11ed-9240-92a06c3be3c2',1,1,'',1000000000000000000,1,0,0,0,'',0,'','','[]','',0);`, - `INSERT INTO database_instance VALUES('localhost',6711,'2022-12-28 07:26:04','2022-12-28 
07:26:04',1094500338,'8.0.31','ROW',1,1,'vt-0000000100-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000100-relay-bin.000002',15815,0,1,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a5138-8680-11ed-acf8-d6b0ef9f4eaa','2022-12-28 07:26:04','',1,0,0,'zone1-0000000100','Homebrew','8.0','FULL',10103920,0,1,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a5138-8680-11ed-acf8-d6b0ef9f4eaa',1,1,'',1000000000000000000,1,0,1,0,'',0,'','','[]','',0);`, - `INSERT INTO database_instance VALUES('localhost',6714,'2022-12-28 07:26:04','2022-12-28 07:26:04',390954723,'8.0.31','ROW',1,1,'vt-0000000101-bin.000001',15583,'',0,0,0,'',0,'',0,NULL,NULL,0,'','',0,0,'',0,0,0,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a4cc4-8680-11ed-a104-47706090afbd','2022-12-28 07:26:04','',0,0,0,'zone1-0000000101','Homebrew','8.0','FULL',11366095,1,1,'ON',1,'','','729a4cc4-8680-11ed-a104-47706090afbd',-1,-1,'',1000000000000000000,1,1,0,2,'',0,'','','[]','',0);`, - `INSERT INTO database_instance VALUES('localhost',6756,'2022-12-28 07:26:05','2022-12-28 07:26:05',444286571,'8.0.31','ROW',1,1,'vt-0000000200-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000200-relay-bin.000002',15815,0,1,0,'zone2','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a497c-8680-11ed-8ad4-3f51d747db75','2022-12-28 07:26:05','',1,0,0,'zone2-0000000200','Homebrew','8.0','FULL',10443112,0,1,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a497c-8680-11ed-8ad4-3f51d747db75',1,1,'',1000000000000000000,1,0,1,0,'',0,'','','[]','',0);`, - `INSERT INTO vitess_tablet VALUES('zone1-0000000100','localhost',6711,'ks','0','zone1',2,'0001-01-01 
00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3130307d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363731307d20706f72745f6d61703a7b6b65793a227674222076616c75653a363730397d206b657973706163653a226b73222073686172643a22302220747970653a5245504c494341206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363731312064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, - `INSERT INTO vitess_tablet VALUES('zone1-0000000101','localhost',6714,'ks','0','zone1',1,'2022-12-28 07:23:25.129898+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3130317d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363731337d20706f72745f6d61703a7b6b65793a227674222076616c75653a363731327d206b657973706163653a226b73222073686172643a22302220747970653a5052494d415259206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a36373134207072696d6172795f7465726d5f73746172745f74696d653a7b7365636f6e64733a31363732323132323035206e616e6f7365636f6e64733a3132393839383030307d2064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, - `INSERT INTO vitess_tablet VALUES('zone1-0000000112','localhost',6747,'ks','0','zone1',3,'0001-01-01 00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3131327d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363734367d20706f72745f6d61703a7b6b65793a227674222076616c75653a363734357d206b657973706163653a226b73222073686172643a22302220747970653a52444f4e4c59206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363734372064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, - `INSERT INTO vitess_tablet VALUES('zone2-0000000200','localhost',6756,'ks','0','zone2',2,'0001-01-01 
00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653222207569643a3230307d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363735357d20706f72745f6d61703a7b6b65793a227674222076616c75653a363735347d206b657973706163653a226b73222073686172643a22302220747970653a5245504c494341206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363735362064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`, - `INSERT INTO vitess_keyspace VALUES('ks',0,'semi_sync');`, - } - // The test is intended to be used as follows. The initial data is stored into the database. Following this, some specific queries are run that each individual test specifies to get the desired state. tests := []struct { name string @@ -662,7 +654,7 @@ func TestGetReplicationAnalysis(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - // Each test should clear the database. The easiest way to do that is to run all the initialization commands again + // Each test should clear the database. The easiest way to do that is to run all the initialization commands again. defer func() { db.ClearVTOrcDatabase() }() @@ -685,3 +677,75 @@ func TestGetReplicationAnalysis(t *testing.T) { }) } } + +// TestAuditInstanceAnalysisInChangelog tests the functionality of the auditInstanceAnalysisInChangelog function +// and verifies that we write the correct number of times to the database. +func TestAuditInstanceAnalysisInChangelog(t *testing.T) { + tests := []struct { + name string + cacheExpiration time.Duration + }{ + { + name: "Long expiration", + cacheExpiration: 2 * time.Minute, + }, { + name: "Very short expiration", + cacheExpiration: 100 * time.Millisecond, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create the cache for the test to use. 
+ oldRecentInstantAnalysisCache := recentInstantAnalysis + oldAnalysisChangeWriteCounter := analysisChangeWriteCounter + + recentInstantAnalysis = cache.New(tt.cacheExpiration, 100*time.Millisecond) + analysisChangeWriteCounter = metrics.NewCounter() + + defer func() { + // Set the old values back. + recentInstantAnalysis = oldRecentInstantAnalysisCache + analysisChangeWriteCounter = oldAnalysisChangeWriteCounter + // Each test should clear the database. The easiest way to do that is to run all the initialization commands again. + db.ClearVTOrcDatabase() + }() + + updates := []struct { + tabletAlias string + analysisCode AnalysisCode + writeCounterExpectation int + wantErr string + }{ + { + // Store a new analysis for the zone1-100 tablet. + tabletAlias: "zone1-100", + analysisCode: ReplicationStopped, + writeCounterExpectation: 1, + }, { + // Write the same analysis, no new write should happen. + tabletAlias: "zone1-100", + analysisCode: ReplicationStopped, + writeCounterExpectation: 1, + }, { + // Change the analysis. This should trigger an update. + tabletAlias: "zone1-100", + analysisCode: ReplicaSemiSyncMustBeSet, + writeCounterExpectation: 2, + }, + } + + for _, upd := range updates { + // We sleep 200 milliseconds to make sure that the cache has had time to update. + // It should be able to delete entries if the expiration is less than 200 milliseconds. 
+ time.Sleep(200 * time.Millisecond) + err := auditInstanceAnalysisInChangelog(upd.tabletAlias, upd.analysisCode) + if upd.wantErr != "" { + require.EqualError(t, err, upd.wantErr) + continue + } + require.NoError(t, err) + require.EqualValues(t, upd.writeCounterExpectation, analysisChangeWriteCounter.Count()) + } + }) + } +} diff --git a/go/vt/vtorc/inst/audit.go b/go/vt/vtorc/inst/audit.go index 6650b01ac18..b094a293040 100644 --- a/go/vt/vtorc/inst/audit.go +++ b/go/vt/vtorc/inst/audit.go @@ -21,6 +21,6 @@ type Audit struct { AuditID int64 AuditTimestamp string AuditType string - AuditInstanceKey InstanceKey + AuditTabletAlias string Message string } diff --git a/go/vt/vtorc/inst/audit_dao.go b/go/vt/vtorc/inst/audit_dao.go index 7882449c655..22bc9395964 100644 --- a/go/vt/vtorc/inst/audit_dao.go +++ b/go/vt/vtorc/inst/audit_dao.go @@ -50,14 +50,11 @@ func EnableAuditSyslog() (err error) { } // AuditOperation creates and writes a new audit entry by given params -func AuditOperation(auditType string, instanceKey *InstanceKey, message string) error { - if instanceKey == nil { - instanceKey = &InstanceKey{} - } +func AuditOperation(auditType string, tabletAlias string, message string) error { keyspace := "" shard := "" - if instanceKey.Hostname != "" { - keyspace, shard, _ = GetKeyspaceShardName(instanceKey) + if tabletAlias != "" { + keyspace, shard, _ = GetKeyspaceShardName(tabletAlias) } auditWrittenToFile := false @@ -71,7 +68,7 @@ func AuditOperation(auditType string, instanceKey *InstanceKey, message string) } defer f.Close() - text := fmt.Sprintf("%s\t%s\t%s\t%d\t[%s:%s]\t%s\t\n", time.Now().Format("2006-01-02 15:04:05"), auditType, instanceKey.Hostname, instanceKey.Port, keyspace, shard, message) + text := fmt.Sprintf("%s\t%s\t%s\t[%s:%s]\t%s\t\n", time.Now().Format("2006-01-02 15:04:05"), auditType, tabletAlias, keyspace, shard, message) if _, err = f.WriteString(text); err != nil { log.Error(err) } @@ -81,14 +78,13 @@ func AuditOperation(auditType string, 
instanceKey *InstanceKey, message string) _, err := db.ExecVTOrc(` insert into audit ( - audit_timestamp, audit_type, hostname, port, keyspace, shard, message + audit_timestamp, audit_type, alias, keyspace, shard, message ) VALUES ( - NOW(), ?, ?, ?, ?, ?, ? + NOW(), ?, ?, ?, ?, ? ) `, auditType, - instanceKey.Hostname, - instanceKey.Port, + tabletAlias, keyspace, shard, message, @@ -98,7 +94,7 @@ func AuditOperation(auditType string, instanceKey *InstanceKey, message string) return err } } - logMessage := fmt.Sprintf("auditType:%s instance:%s keyspace:%s shard:%s message:%s", auditType, instanceKey.DisplayString(), keyspace, shard, message) + logMessage := fmt.Sprintf("auditType:%s instance:%s keyspace:%s shard:%s message:%s", auditType, tabletAlias, keyspace, shard, message) if syslogWriter != nil { auditWrittenToFile = true go func() { @@ -114,21 +110,20 @@ func AuditOperation(auditType string, instanceKey *InstanceKey, message string) } // ReadRecentAudit returns a list of audit entries order chronologically descending, using page number. -func ReadRecentAudit(instanceKey *InstanceKey, page int) ([]Audit, error) { +func ReadRecentAudit(tabletAlias string, page int) ([]Audit, error) { res := []Audit{} args := sqlutils.Args() whereCondition := `` - if instanceKey != nil { - whereCondition = `where hostname=? 
and port=?` - args = append(args, instanceKey.Hostname, instanceKey.Port) + if tabletAlias != "" { + whereCondition = `where alias=?` + args = append(args, tabletAlias) } query := fmt.Sprintf(` select audit_id, audit_timestamp, audit_type, - hostname, - port, + alias, message from audit @@ -144,8 +139,7 @@ func ReadRecentAudit(instanceKey *InstanceKey, page int) ([]Audit, error) { audit.AuditID = m.GetInt64("audit_id") audit.AuditTimestamp = m.GetString("audit_timestamp") audit.AuditType = m.GetString("audit_type") - audit.AuditInstanceKey.Hostname = m.GetString("hostname") - audit.AuditInstanceKey.Port = m.GetInt("port") + audit.AuditTabletAlias = m.GetString("alias") audit.Message = m.GetString("message") res = append(res, audit) diff --git a/go/vt/vtorc/inst/audit_dao_test.go b/go/vt/vtorc/inst/audit_dao_test.go index 4a6533077c2..2b3c3bfad64 100644 --- a/go/vt/vtorc/inst/audit_dao_test.go +++ b/go/vt/vtorc/inst/audit_dao_test.go @@ -24,11 +24,13 @@ import ( "github.com/stretchr/testify/require" topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/vtorc/config" "vitess.io/vitess/go/vt/vtorc/db" ) // TestAuditOperation tests that auditing a operation works as intended based on the configurations. +// This test also verifies that we are able to read the recent audits that are written to the databaes. 
func TestAuditOperation(t *testing.T) { // Restore original configurations originalAuditSysLog := config.Config.AuditToSyslog @@ -69,10 +71,7 @@ func TestAuditOperation(t *testing.T) { err = SaveTablet(tab100) require.NoError(t, err) - instance100 := &InstanceKey{ - Hostname: hostname, - Port: int(port), - } + tab100Alias := topoproto.TabletAliasString(tab100.Alias) auditType := "test-audit-operation" message := "test-message" @@ -82,16 +81,26 @@ func TestAuditOperation(t *testing.T) { config.Config.AuditToBackendDB = true // Auditing should succeed as expected - err = AuditOperation(auditType, instance100, message) + err = AuditOperation(auditType, tab100Alias, message) + require.NoError(t, err) + + // Check that we can read the recent audits + audits, err := ReadRecentAudit(tab100Alias, 0) require.NoError(t, err) + require.Len(t, audits, 1) + require.EqualValues(t, 1, audits[0].AuditID) + require.EqualValues(t, auditType, audits[0].AuditType) + require.EqualValues(t, message, audits[0].Message) + require.EqualValues(t, tab100Alias, audits[0].AuditTabletAlias) - audits, err := ReadRecentAudit(instance100, 0) + // Check the same for no-filtering + audits, err = ReadRecentAudit("", 0) require.NoError(t, err) require.Len(t, audits, 1) require.EqualValues(t, 1, audits[0].AuditID) require.EqualValues(t, auditType, audits[0].AuditType) require.EqualValues(t, message, audits[0].Message) - require.EqualValues(t, *instance100, audits[0].AuditInstanceKey) + require.EqualValues(t, tab100Alias, audits[0].AuditTabletAlias) }) t.Run("Audit to File", func(t *testing.T) { @@ -103,7 +112,7 @@ func TestAuditOperation(t *testing.T) { defer os.Remove(file.Name()) config.Config.AuditLogFile = file.Name() - err = AuditOperation(auditType, instance100, message) + err = AuditOperation(auditType, tab100Alias, message) require.NoError(t, err) // Give a little time for the write to succeed since it happens in a separate go-routine @@ -112,6 +121,6 @@ func TestAuditOperation(t *testing.T) { 
time.Sleep(100 * time.Millisecond) fileContent, err := os.ReadFile(file.Name()) require.NoError(t, err) - require.Contains(t, string(fileContent), "\ttest-audit-operation\tlocalhost\t100\t[ks:0]\ttest-message") + require.Contains(t, string(fileContent), "\ttest-audit-operation\tzone-1-0000000100\t[ks:0]\ttest-message") }) } diff --git a/go/vt/vtorc/inst/candidate_database_instance.go b/go/vt/vtorc/inst/candidate_database_instance.go deleted file mode 100644 index 5cd4b5c6a0b..00000000000 --- a/go/vt/vtorc/inst/candidate_database_instance.go +++ /dev/null @@ -1,56 +0,0 @@ -/* - Copyright 2016 Simon J Mudd - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -package inst - -import ( - "fmt" - - "vitess.io/vitess/go/vt/vtctl/reparentutil/promotionrule" - "vitess.io/vitess/go/vt/vtorc/db" -) - -// CandidateDatabaseInstance contains information about explicit promotion rules for an instance -type CandidateDatabaseInstance struct { - Hostname string - Port int - PromotionRule promotionrule.CandidatePromotionRule - LastSuggestedString string - PromotionRuleExpiry string // generated when retrieved from database for consistency reasons -} - -func NewCandidateDatabaseInstance(instanceKey *InstanceKey, promotionRule promotionrule.CandidatePromotionRule) *CandidateDatabaseInstance { - return &CandidateDatabaseInstance{ - Hostname: instanceKey.Hostname, - Port: instanceKey.Port, - PromotionRule: promotionRule, - } -} - -func (cdi *CandidateDatabaseInstance) WithCurrentTime() *CandidateDatabaseInstance { - cdi.LastSuggestedString, _ = db.ReadTimeNow() - return cdi -} - -// String returns a string representation of the CandidateDatabaseInstance struct -func (cdi *CandidateDatabaseInstance) String() string { - return fmt.Sprintf("%s:%d %s", cdi.Hostname, cdi.Port, cdi.PromotionRule) -} - -// Key returns an instance key representing this candidate -func (cdi *CandidateDatabaseInstance) Key() *InstanceKey { - return &InstanceKey{Hostname: cdi.Hostname, Port: cdi.Port} -} diff --git a/go/vt/vtorc/inst/candidate_database_instance_dao.go b/go/vt/vtorc/inst/candidate_database_instance_dao.go deleted file mode 100644 index 95bbb53f617..00000000000 --- a/go/vt/vtorc/inst/candidate_database_instance_dao.go +++ /dev/null @@ -1,69 +0,0 @@ -/* - Copyright 2016 Simon J Mudd - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package inst - -import ( - "vitess.io/vitess/go/vt/external/golib/sqlutils" - "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/vtorc/config" - "vitess.io/vitess/go/vt/vtorc/db" -) - -// RegisterCandidateInstance markes a given instance as suggested for succeeding a primary in the event of failover. -func RegisterCandidateInstance(candidate *CandidateDatabaseInstance) error { - if candidate.LastSuggestedString == "" { - candidate = candidate.WithCurrentTime() - } - args := sqlutils.Args(candidate.Hostname, candidate.Port, string(candidate.PromotionRule), candidate.LastSuggestedString) - - query := ` - insert into candidate_database_instance ( - hostname, - port, - promotion_rule, - last_suggested - ) values ( - ?, ?, ?, ? - ) on duplicate key update - last_suggested=values(last_suggested), - promotion_rule=values(promotion_rule) - ` - writeFunc := func() error { - _, err := db.ExecVTOrc(query, args...) - if err != nil { - log.Error(err) - } - return err - } - return ExecDBWriteFunc(writeFunc) -} - -// ExpireCandidateInstances removes stale primary candidate suggestions. -func ExpireCandidateInstances() error { - writeFunc := func() error { - _, err := db.ExecVTOrc(` - delete from candidate_database_instance - where last_suggested < NOW() - INTERVAL ? 
MINUTE - `, config.CandidateInstanceExpireMinutes, - ) - if err != nil { - log.Error(err) - } - return err - } - return ExecDBWriteFunc(writeFunc) -} diff --git a/go/vt/vtorc/inst/downtime.go b/go/vt/vtorc/inst/downtime.go deleted file mode 100644 index 7110df1e60b..00000000000 --- a/go/vt/vtorc/inst/downtime.go +++ /dev/null @@ -1,52 +0,0 @@ -/* - Copyright 2017 Shlomi Noach, GitHub Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package inst - -import ( - "time" -) - -type Downtime struct { - Key *InstanceKey - Owner string - Reason string - Duration time.Duration - BeginsAt time.Time - EndsAt time.Time - BeginsAtString string - EndsAtString string -} - -func NewDowntime(instanceKey *InstanceKey, owner string, reason string, duration time.Duration) *Downtime { - downtime := &Downtime{ - Key: instanceKey, - Owner: owner, - Reason: reason, - Duration: duration, - BeginsAt: time.Now(), - } - downtime.EndsAt = downtime.BeginsAt.Add(downtime.Duration) - return downtime -} - -func (downtime *Downtime) Ended() bool { - return downtime.EndsAt.Before(time.Now()) -} - -func (downtime *Downtime) EndsIn() time.Duration { - return time.Until(downtime.EndsAt) -} diff --git a/go/vt/vtorc/inst/downtime_dao.go b/go/vt/vtorc/inst/downtime_dao.go deleted file mode 100644 index 53b12e325e8..00000000000 --- a/go/vt/vtorc/inst/downtime_dao.go +++ /dev/null @@ -1,193 +0,0 @@ -/* - Copyright 2015 Shlomi Noach, courtesy Booking.com - - Licensed under the Apache License, Version 2.0 (the 
"License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package inst - -import ( - "fmt" - "time" - - "vitess.io/vitess/go/vt/log" - - "vitess.io/vitess/go/vt/vtorc/config" - "vitess.io/vitess/go/vt/vtorc/db" -) - -// BeginDowntime will make mark an instance as downtimed (or override existing downtime period) -func BeginDowntime(downtime *Downtime) (err error) { - if downtime.Duration == 0 { - downtime.Duration = config.MaintenanceExpireMinutes * time.Minute - } - if downtime.EndsAtString != "" { - _, err = db.ExecVTOrc(` - insert - into database_instance_downtime ( - hostname, port, downtime_active, begin_timestamp, end_timestamp, owner, reason - ) VALUES ( - ?, ?, 1, ?, ?, ?, ? - ) - on duplicate key update - downtime_active=values(downtime_active), - begin_timestamp=values(begin_timestamp), - end_timestamp=values(end_timestamp), - owner=values(owner), - reason=values(reason) - `, - downtime.Key.Hostname, - downtime.Key.Port, - downtime.BeginsAtString, - downtime.EndsAtString, - downtime.Owner, - downtime.Reason, - ) - } else { - if downtime.Ended() { - // No point in writing it down; it's expired - return nil - } - - _, err = db.ExecVTOrc(` - insert - into database_instance_downtime ( - hostname, port, downtime_active, begin_timestamp, end_timestamp, owner, reason - ) VALUES ( - ?, ?, 1, NOW(), NOW() + INTERVAL ? SECOND, ?, ? 
- ) - on duplicate key update - downtime_active=values(downtime_active), - begin_timestamp=values(begin_timestamp), - end_timestamp=values(end_timestamp), - owner=values(owner), - reason=values(reason) - `, - downtime.Key.Hostname, - downtime.Key.Port, - int(downtime.EndsIn().Seconds()), - downtime.Owner, - downtime.Reason, - ) - } - if err != nil { - log.Error(err) - return err - } - _ = AuditOperation("begin-downtime", downtime.Key, fmt.Sprintf("owner: %s, reason: %s", downtime.Owner, downtime.Reason)) - - return nil -} - -// EndDowntime will remove downtime flag from an instance -func EndDowntime(instanceKey *InstanceKey) (wasDowntimed bool, err error) { - res, err := db.ExecVTOrc(` - delete from - database_instance_downtime - where - hostname = ? - and port = ? - `, - instanceKey.Hostname, - instanceKey.Port, - ) - if err != nil { - log.Error(err) - return wasDowntimed, err - } - - if affected, _ := res.RowsAffected(); affected > 0 { - wasDowntimed = true - _ = AuditOperation("end-downtime", instanceKey, "") - } - return wasDowntimed, err -} - -// renewLostInRecoveryDowntime renews hosts who are downtimed due to being lost in recovery, such that -// their downtime never expires. -func renewLostInRecoveryDowntime() error { - _, err := db.ExecVTOrc(` - update - database_instance_downtime - set - end_timestamp = NOW() + INTERVAL ? SECOND - where - end_timestamp > NOW() - and reason = ? - `, - config.LostInRecoveryDowntimeSeconds, - DowntimeLostInRecoveryMessage, - ) - - return err -} - -// expireLostInRecoveryDowntime expires downtime for servers who have been lost in recovery in the last, -// but are now replicating. 
-func expireLostInRecoveryDowntime() error { - instances, err := ReadLostInRecoveryInstances("", "") - if err != nil { - return err - } - if len(instances) == 0 { - return nil - } - for _, instance := range instances { - // We _may_ expire this downtime, but only after a minute - // This is a graceful period, during which other servers can claim ownership of the alias, - // or can update their own cluster name to match a new primary's name - if instance.ElapsedDowntime < time.Minute { - continue - } - if !instance.IsLastCheckValid { - continue - } - if instance.ReplicaRunning() { - // back, alive, replicating in some topology - if _, err := EndDowntime(&instance.Key); err != nil { - return err - } - } - } - return nil -} - -// ExpireDowntime will remove the maintenance flag on old downtimes -func ExpireDowntime() error { - if err := renewLostInRecoveryDowntime(); err != nil { - log.Error(err) - return err - } - if err := expireLostInRecoveryDowntime(); err != nil { - log.Error(err) - return err - } - { - res, err := db.ExecVTOrc(` - delete from - database_instance_downtime - where - end_timestamp < NOW() - `, - ) - if err != nil { - log.Error(err) - return err - } - if rowsAffected, _ := res.RowsAffected(); rowsAffected > 0 { - _ = AuditOperation("expire-downtime", nil, fmt.Sprintf("Expired %d entries", rowsAffected)) - } - } - - return nil -} diff --git a/go/vt/vtorc/inst/durability.go b/go/vt/vtorc/inst/durability.go deleted file mode 100644 index 272fa838af8..00000000000 --- a/go/vt/vtorc/inst/durability.go +++ /dev/null @@ -1,83 +0,0 @@ -/* -Copyright 2020 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package inst - -import ( - topodatapb "vitess.io/vitess/go/vt/proto/topodata" - "vitess.io/vitess/go/vt/vtctl/reparentutil" - "vitess.io/vitess/go/vt/vtctl/reparentutil/promotionrule" -) - -// IsReplicaSemiSync returns the replica semi-sync setting for the instance. -func IsReplicaSemiSync[V InstanceKey | *topodatapb.Tablet](durabilityPolicy reparentutil.Durabler, primaryInstance V, replicaInstance V) bool { - primary, err := getTablet(primaryInstance) - if err != nil { - return false - } - replica, err := getTablet(replicaInstance) - if err != nil { - return false - } - return reparentutil.IsReplicaSemiSync(durabilityPolicy, primary, replica) -} - -// SemiSyncAckers returns the primary semi-sync setting for the instance. -// 0 means none. Non-zero specifies the number of required ackers. -func SemiSyncAckers[V InstanceKey | *topodatapb.Tablet](durabilityPolicy reparentutil.Durabler, instance V) int { - primary, err := getTablet(instance) - if err != nil { - return 0 - } - return reparentutil.SemiSyncAckers(durabilityPolicy, primary) -} - -// PromotionRule returns the promotion rule for the instance. 
-func PromotionRule[V InstanceKey | *topodatapb.Tablet](durabilityPolicy reparentutil.Durabler, instance V) promotionrule.CandidatePromotionRule { - tablet, err := getTablet(instance) - if err != nil { - return promotionrule.MustNot - } - return reparentutil.PromotionRule(durabilityPolicy, tablet) -} - -func getTablet[V InstanceKey | *topodatapb.Tablet](instance V) (*topodatapb.Tablet, error) { - var instanceTablet *topodatapb.Tablet - var err error - switch node := any(instance).(type) { - case InstanceKey: - instanceTablet, err = ReadTablet(node) - if err != nil { - return nil, err - } - case *topodatapb.Tablet: - instanceTablet = node - } - return instanceTablet, nil -} - -// GetDurabilityPolicy gets the durability policy for the keyspace of the given instance -func GetDurabilityPolicy[V InstanceKey | *topodatapb.Tablet](instance V) (reparentutil.Durabler, error) { - tablet, err := getTablet(instance) - if err != nil { - return nil, err - } - ki, err := ReadKeyspace(tablet.Keyspace) - if err != nil { - return nil, err - } - return reparentutil.GetDurabilityPolicy(ki.DurabilityPolicy) -} diff --git a/go/vt/vtorc/inst/instance.go b/go/vt/vtorc/inst/instance.go index dd1526ff090..39425f718ee 100644 --- a/go/vt/vtorc/inst/instance.go +++ b/go/vt/vtorc/inst/instance.go @@ -21,16 +21,13 @@ import ( "encoding/json" "strings" "time" - - "vitess.io/vitess/go/vt/vtctl/reparentutil/promotionrule" ) -const ReasonableDiscoveryLatency = 500 * time.Millisecond - // Instance represents a database instance, including its current configuration & status. // It presents important replication configuration and detailed replication status. 
type Instance struct { - Key InstanceKey + Hostname string + Port int InstanceAlias string ServerID uint ServerUUID string @@ -43,10 +40,10 @@ type Instance struct { LogBinEnabled bool LogReplicationUpdatesEnabled bool SelfBinlogCoordinates BinlogCoordinates - SourceKey InstanceKey + SourceHost string + SourcePort int SourceUUID string AncestryUUID string - IsDetachedPrimary bool ReplicationSQLThreadRuning bool ReplicationIOThreadRuning bool @@ -95,50 +92,20 @@ type Instance struct { IsRecentlyChecked bool SecondsSinceLastSeen sql.NullInt64 - // Careful. IsCandidate and PromotionRule are used together - // and probably need to be merged. IsCandidate's value may - // be picked up from daabase_candidate_instance's value when - // reading an instance from the db. - IsCandidate bool - PromotionRule promotionrule.CandidatePromotionRule - IsDowntimed bool - DowntimeReason string - DowntimeOwner string - DowntimeEndTimestamp string - ElapsedDowntime time.Duration - UnresolvedHostname string - AllowTLS bool + AllowTLS bool Problems []string LastDiscoveryLatency time.Duration seed bool // Means we force this instance to be written to backend, even if it's invalid, empty or forgotten - - /* All things Group Replication below */ - - // Group replication global variables - ReplicationGroupName string - ReplicationGroupIsSinglePrimary bool - - // Replication group members information. See - // https://dev.mysql.com/doc/refman/8.0/en/replication-group-members-table.html for details. 
- ReplicationGroupMemberState string - ReplicationGroupMemberRole string - - // List of all known members of the same group - ReplicationGroupMembers InstanceKeyMap - - // Primary of the replication group - ReplicationGroupPrimaryInstanceKey InstanceKey } // NewInstance creates a new, empty instance func NewInstance() *Instance { return &Instance{ - ReplicationGroupMembers: make(map[InstanceKey]bool), - Problems: []string{}, + Problems: []string{}, } } @@ -154,7 +121,7 @@ func (instance *Instance) MarshalJSON() ([]byte, error) { // Equals tests that this instance is the same instance as other. The function does not test // configuration or status. func (instance *Instance) Equals(other *Instance) bool { - return instance.Key == other.Key + return instance.InstanceAlias == other.InstanceAlias } // MajorVersion returns this instance's major version number (e.g. for 5.5.36 it returns "5.5") @@ -219,21 +186,6 @@ func (instance *Instance) IsNDB() bool { return strings.Contains(instance.Version, "-ndb-") } -// IsReplicationGroup checks whether the host thinks it is part of a known replication group. 
Notice that this might -// return True even if the group has decided to expel the member represented by this instance, as the instance might not -// know that under certain circumstances -func (instance *Instance) IsReplicationGroupMember() bool { - return instance.ReplicationGroupName != "" -} - -func (instance *Instance) IsReplicationGroupPrimary() bool { - return instance.IsReplicationGroupMember() && instance.ReplicationGroupPrimaryInstanceKey.Equals(&instance.Key) -} - -func (instance *Instance) IsReplicationGroupSecondary() bool { - return instance.IsReplicationGroupMember() && !instance.ReplicationGroupPrimaryInstanceKey.Equals(&instance.Key) -} - // IsBinlogServer checks whether this is any type of a binlog server func (instance *Instance) IsBinlogServer() bool { return false @@ -288,27 +240,13 @@ func (instance *Instance) FlavorNameAndMajorVersion() string { // IsReplica makes simple heuristics to decide whether this instance is a replica of another instance func (instance *Instance) IsReplica() bool { - return instance.SourceKey.Hostname != "" && instance.SourceKey.Hostname != "_" && instance.SourceKey.Port != 0 && (instance.ReadBinlogCoordinates.LogFile != "" || instance.UsingGTID()) + return instance.SourceHost != "" && instance.SourceHost != "_" && instance.SourcePort != 0 && (instance.ReadBinlogCoordinates.LogFile != "" || instance.UsingGTID()) } // IsPrimary makes simple heuristics to decide whether this instance is a primary (not replicating from any other server), // either via traditional async/semisync replication or group replication func (instance *Instance) IsPrimary() bool { - // If traditional replication is configured, it is for sure not a primary - if instance.IsReplica() { - return false - } - // If traditional replication is not configured, and it is also not part of a replication group, this host is - // a primary - if !instance.IsReplicationGroupMember() { - return true - } - // If traditional replication is not configured, and this 
host is part of a group, it is only considered a - // primary if it has the role of group Primary. Otherwise it is not a primary. - if instance.ReplicationGroupMemberRole == GroupReplicationMemberRolePrimary { - return true - } - return false + return !instance.IsReplica() } // ReplicaRunning returns true when this instance's status is of a replicating replica. @@ -335,8 +273,3 @@ func (instance *Instance) SQLThreadUpToDate() bool { func (instance *Instance) UsingGTID() bool { return instance.UsingOracleGTID || instance.UsingMariaDBGTID } - -// AddGroupMemberKey adds a group member to the list of this instance's group members. -func (instance *Instance) AddGroupMemberKey(groupMemberKey *InstanceKey) { - instance.ReplicationGroupMembers.AddKey(*groupMemberKey) -} diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index 7d122ea61a0..18ddc4bf3e4 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -40,8 +40,6 @@ import ( replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata" topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/topo/topoproto" - "vitess.io/vitess/go/vt/vtctl/reparentutil" - "vitess.io/vitess/go/vt/vtctl/reparentutil/promotionrule" "vitess.io/vitess/go/vt/vtorc/collection" "vitess.io/vitess/go/vt/vtorc/config" "vitess.io/vitess/go/vt/vtorc/db" @@ -63,21 +61,7 @@ var ( errantGtidMap = make(map[string]string) ) -// Constant strings for Group Replication information -// See https://dev.mysql.com/doc/refman/8.0/en/replication-group-members-table.html for additional information. 
-const ( - // Group member roles - GroupReplicationMemberRolePrimary = "PRIMARY" - GroupReplicationMemberRoleSecondary = "SECONDARY" - // Group member states - GroupReplicationMemberStateOnline = "ONLINE" - GroupReplicationMemberStateRecovering = "RECOVERING" - GroupReplicationMemberStateUnreachable = "UNREACHABLE" - GroupReplicationMemberStateOffline = "OFFLINE" - GroupReplicationMemberStateError = "ERROR" -) - -var forgetInstanceKeys *cache.Cache +var forgetAliases *cache.Cache var accessDeniedCounter = metrics.NewCounter() var readTopologyInstanceCounter = metrics.NewCounter() @@ -106,7 +90,7 @@ func init() { func initializeInstanceDao() { config.WaitForConfigurationToBeLoaded() - forgetInstanceKeys = cache.New(time.Duration(config.Config.InstancePollSeconds*3)*time.Second, time.Second) + forgetAliases = cache.New(time.Duration(config.Config.InstancePollSeconds*3)*time.Second, time.Second) } // ExecDBWriteFunc chooses how to execute a write onto the database: whether synchronuously or not @@ -149,19 +133,19 @@ func ExpireTableData(tableName string, timestampColumn string) error { // logReadTopologyInstanceError logs an error, if applicable, for a ReadTopologyInstance operation, // providing context and hint as for the source of the error. If there's no hint just provide the // original error. 
-func logReadTopologyInstanceError(instanceKey *InstanceKey, hint string, err error) error { +func logReadTopologyInstanceError(tabletAlias string, hint string, err error) error { if err == nil { return nil } - if !util.ClearToLog("ReadTopologyInstance", instanceKey.StringCode()) { + if !util.ClearToLog("ReadTopologyInstance", tabletAlias) { return err } var msg string if hint == "" { - msg = fmt.Sprintf("ReadTopologyInstance(%+v): %+v", *instanceKey, err) + msg = fmt.Sprintf("ReadTopologyInstance(%+v): %+v", tabletAlias, err) } else { msg = fmt.Sprintf("ReadTopologyInstance(%+v) %+v: %+v", - *instanceKey, + tabletAlias, strings.Replace(hint, "%", "%%", -1), // escape % err) } @@ -172,8 +156,8 @@ func logReadTopologyInstanceError(instanceKey *InstanceKey, hint string, err err // ReadTopologyInstance collects information on the state of a MySQL // server and writes the result synchronously to the vtorc // backend. -func ReadTopologyInstance(instanceKey *InstanceKey) (*Instance, error) { - return ReadTopologyInstanceBufferable(instanceKey, nil) +func ReadTopologyInstance(tabletAlias string) (*Instance, error) { + return ReadTopologyInstanceBufferable(tabletAlias, nil) } // ReadTopologyInstanceBufferable connects to a topology MySQL instance @@ -181,43 +165,35 @@ func ReadTopologyInstance(instanceKey *InstanceKey) (*Instance, error) { // It writes the information retrieved into vtorc's backend. // - writes are optionally buffered. // - timing information can be collected for the stages performed. 
-func ReadTopologyInstanceBufferable(instanceKey *InstanceKey, latency *stopwatch.NamedStopwatch) (inst *Instance, err error) { +func ReadTopologyInstanceBufferable(tabletAlias string, latency *stopwatch.NamedStopwatch) (inst *Instance, err error) { defer func() { if r := recover(); r != nil { - err = logReadTopologyInstanceError(instanceKey, "Unexpected, aborting", tb.Errorf("%+v", r)) + err = logReadTopologyInstanceError(tabletAlias, "Unexpected, aborting", tb.Errorf("%+v", r)) } }() var waitGroup sync.WaitGroup var tablet *topodatapb.Tablet - var durability reparentutil.Durabler var fullStatus *replicationdatapb.FullStatus readingStartTime := time.Now() instance := NewInstance() instanceFound := false partialSuccess := false - resolvedHostname := "" errorChan := make(chan error, 32) - var resolveErr error - if !instanceKey.IsValid() { - latency.Start("backend") - if err := UpdateInstanceLastAttemptedCheck(instanceKey); err != nil { - log.Errorf("ReadTopologyInstanceBufferable: %+v: %v", instanceKey, err) - } - latency.Stop("backend") - return instance, fmt.Errorf("ReadTopologyInstance will not act on invalid instance key: %+v", *instanceKey) + if tabletAlias == "" { + return instance, fmt.Errorf("ReadTopologyInstance will not act on empty tablet alias") } lastAttemptedCheckTimer := time.AfterFunc(time.Second, func() { go func() { - _ = UpdateInstanceLastAttemptedCheck(instanceKey) + _ = UpdateInstanceLastAttemptedCheck(tabletAlias) }() }) latency.Start("instance") - tablet, err = ReadTablet(*instanceKey) + tablet, err = ReadTablet(tabletAlias) if err != nil { goto Cleanup } @@ -228,18 +204,14 @@ func ReadTopologyInstanceBufferable(instanceKey *InstanceKey, latency *stopwatch goto Cleanup } - durability, err = GetDurabilityPolicy(tablet) - if err != nil { - goto Cleanup - } - - fullStatus, err = FullStatus(*instanceKey) + fullStatus, err = FullStatus(tabletAlias) if err != nil { goto Cleanup } partialSuccess = true // We at least managed to read something from 
the server. - instance.Key = *instanceKey + instance.Hostname = tablet.MysqlHostname + instance.Port = int(tablet.MysqlPort) { // We begin with a few operations we can run concurrently, and which do not depend on anything instance.ServerID = uint(fullStatus.ServerId) @@ -249,7 +221,6 @@ func ReadTopologyInstanceBufferable(instanceKey *InstanceKey, latency *stopwatch instance.BinlogFormat = fullStatus.BinlogFormat instance.LogReplicationUpdatesEnabled = fullStatus.LogReplicaUpdates instance.VersionComment = fullStatus.VersionComment - resolvedHostname = instance.Key.Hostname if instance.LogBinEnabled && fullStatus.PrimaryStatus != nil { binlogPos, err := getBinlogCoordinatesFromPositionString(fullStatus.PrimaryStatus.FilePosition) @@ -291,19 +262,6 @@ func ReadTopologyInstanceBufferable(instanceKey *InstanceKey, latency *stopwatch } } } - if resolvedHostname != instance.Key.Hostname { - latency.Start("backend") - UpdateResolvedHostname(instance.Key.Hostname, resolvedHostname) - latency.Stop("backend") - instance.Key.Hostname = resolvedHostname - } - if instance.Key.Hostname == "" { - err = fmt.Errorf("ReadTopologyInstance: empty hostname (%+v). 
Bailing out", *instanceKey) - goto Cleanup - } - go func() { - _ = ResolveHostnameIPs(instance.Key.Hostname) - }() instance.ReplicationIOThreadState = ReplicationThreadStateNoThread instance.ReplicationSQLThreadState = ReplicationThreadStateNoThread @@ -338,17 +296,8 @@ func ReadTopologyInstanceBufferable(instanceKey *InstanceKey, latency *stopwatch instance.SourceUUID = fullStatus.ReplicationStatus.SourceUuid instance.HasReplicationFilters = fullStatus.ReplicationStatus.HasReplicationFilters - primaryHostname := fullStatus.ReplicationStatus.SourceHost - primaryKey, err := NewResolveInstanceKey(primaryHostname, int(fullStatus.ReplicationStatus.SourcePort)) - if err != nil { - _ = logReadTopologyInstanceError(instanceKey, "NewResolveInstanceKey", err) - } - primaryKey.Hostname, resolveErr = ResolveHostname(primaryKey.Hostname) - if resolveErr != nil { - _ = logReadTopologyInstanceError(instanceKey, fmt.Sprintf("ResolveHostname(%q)", primaryKey.Hostname), resolveErr) - } - instance.SourceKey = *primaryKey - instance.IsDetachedPrimary = instance.SourceKey.IsDetached() + instance.SourceHost = fullStatus.ReplicationStatus.SourceHost + instance.SourcePort = int(fullStatus.ReplicationStatus.SourcePort) if fullStatus.ReplicationStatus.ReplicationLagUnknown { instance.SecondsBehindPrimary.Valid = false @@ -357,7 +306,7 @@ func ReadTopologyInstanceBufferable(instanceKey *InstanceKey, latency *stopwatch instance.SecondsBehindPrimary.Int64 = int64(fullStatus.ReplicationStatus.ReplicationLagSeconds) } if instance.SecondsBehindPrimary.Valid && instance.SecondsBehindPrimary.Int64 < 0 { - log.Warningf("Host: %+v, instance.ReplicationLagSeconds < 0 [%+v], correcting to 0", instanceKey, instance.SecondsBehindPrimary.Int64) + log.Warningf("Host: %+v, instance.ReplicationLagSeconds < 0 [%+v], correcting to 0", tabletAlias, instance.SecondsBehindPrimary.Int64) instance.SecondsBehindPrimary.Int64 = 0 } // And until told otherwise: @@ -380,16 +329,9 @@ func 
ReadTopologyInstanceBufferable(instanceKey *InstanceKey, latency *stopwatch latency.Start("backend") err = ReadInstanceClusterAttributes(instance) latency.Stop("backend") - _ = logReadTopologyInstanceError(instanceKey, "ReadInstanceClusterAttributes", err) + _ = logReadTopologyInstanceError(tabletAlias, "ReadInstanceClusterAttributes", err) } - // We need to update candidate_database_instance. - // We register the rule even if it hasn't changed, - // to bump the last_suggested time. - instance.PromotionRule = PromotionRule(durability, tablet) - err = RegisterCandidateInstance(NewCandidateDatabaseInstance(instanceKey, instance.PromotionRule).WithCurrentTime()) - _ = logReadTopologyInstanceError(instanceKey, "RegisterCandidateInstance", err) - Cleanup: waitGroup.Wait() close(errorChan) @@ -415,7 +357,6 @@ Cleanup: } // Add replication group ancestry UUID as well. Otherwise, VTOrc thinks there are errant GTIDs in group // members and its replicas, even though they are not. - instance.AncestryUUID = fmt.Sprintf("%s,%s", instance.AncestryUUID, instance.ReplicationGroupName) instance.AncestryUUID = strings.Trim(instance.AncestryUUID, ",") if instance.ExecutedGtidSet != "" && instance.primaryExecutedGtidSet != "" { // Compare primary & replica GTID sets, but ignore the sets that present the primary's UUID. @@ -468,7 +409,7 @@ Cleanup: // tried to check the instance. last_attempted_check is also // updated on success by writeInstance. latency.Start("backend") - _ = UpdateInstanceLastChecked(instanceKey, partialSuccess) + _ = UpdateInstanceLastChecked(tabletAlias, partialSuccess) latency.Stop("backend") return nil, err } @@ -490,41 +431,15 @@ func getBinlogCoordinatesFromPositionString(position string) (BinlogCoordinates, return *binLogCoordinates, nil } -func ReadReplicationGroupPrimary(instance *Instance) (err error) { - query := ` - SELECT - replication_group_primary_host, - replication_group_primary_port - FROM - database_instance - WHERE - replication_group_name = ? 
- AND replication_group_member_role = 'PRIMARY' -` - queryArgs := sqlutils.Args(instance.ReplicationGroupName) - err = db.QueryVTOrc(query, queryArgs, func(row sqlutils.RowMap) error { - groupPrimaryHost := row.GetString("replication_group_primary_host") - groupPrimaryPort := row.GetInt("replication_group_primary_port") - resolvedGroupPrimary, err := NewResolveInstanceKey(groupPrimaryHost, groupPrimaryPort) - if err != nil { - return err - } - instance.ReplicationGroupPrimaryInstanceKey = *resolvedGroupPrimary - return nil - }) - return err -} - // ReadInstanceClusterAttributes will return the cluster name for a given instance by looking at its primary // and getting it from there. // It is a non-recursive function and so-called-recursion is performed upon periodic reading of // instances. func ReadInstanceClusterAttributes(instance *Instance) (err error) { - var primaryOrGroupPrimaryInstanceKey InstanceKey - var primaryOrGroupPrimaryReplicationDepth uint + var primaryReplicationDepth uint var ancestryUUID string - var primaryOrGroupPrimaryExecutedGtidSet string - primaryOrGroupPrimaryDataFound := false + var primaryExecutedGtidSet string + primaryDataFound := false query := ` select @@ -536,22 +451,16 @@ func ReadInstanceClusterAttributes(instance *Instance) (err error) { from database_instance where hostname=? and port=? ` - // For instances that are part of a replication group, if the host is not the group's primary, we use the - // information from the group primary. If it is the group primary, we use the information of its primary - // (if it has any). If it is not a group member, we use the information from the host's primary. 
- if instance.IsReplicationGroupSecondary() { - primaryOrGroupPrimaryInstanceKey = instance.ReplicationGroupPrimaryInstanceKey - } else { - primaryOrGroupPrimaryInstanceKey = instance.SourceKey - } - args := sqlutils.Args(primaryOrGroupPrimaryInstanceKey.Hostname, primaryOrGroupPrimaryInstanceKey.Port) + primaryHostname := instance.SourceHost + primaryPort := instance.SourcePort + args := sqlutils.Args(primaryHostname, primaryPort) err = db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error { - primaryOrGroupPrimaryReplicationDepth = m.GetUint("replication_depth") - primaryOrGroupPrimaryInstanceKey.Hostname = m.GetString("source_host") - primaryOrGroupPrimaryInstanceKey.Port = m.GetInt("source_port") + primaryReplicationDepth = m.GetUint("replication_depth") + primaryHostname = m.GetString("source_host") + primaryPort = m.GetInt("source_port") ancestryUUID = m.GetString("ancestry_uuid") - primaryOrGroupPrimaryExecutedGtidSet = m.GetString("executed_gtid_set") - primaryOrGroupPrimaryDataFound = true + primaryExecutedGtidSet = m.GetString("executed_gtid_set") + primaryDataFound = true return nil }) if err != nil { @@ -560,18 +469,18 @@ func ReadInstanceClusterAttributes(instance *Instance) (err error) { } var replicationDepth uint - if primaryOrGroupPrimaryDataFound { - replicationDepth = primaryOrGroupPrimaryReplicationDepth + 1 + if primaryDataFound { + replicationDepth = primaryReplicationDepth + 1 } isCoPrimary := false - if primaryOrGroupPrimaryInstanceKey.Equals(&instance.Key) { + if primaryHostname == instance.Hostname && primaryPort == instance.Port { // co-primary calls for special case, in fear of the infinite loop isCoPrimary = true } instance.ReplicationDepth = replicationDepth instance.IsCoPrimary = isCoPrimary instance.AncestryUUID = ancestryUUID - instance.primaryExecutedGtidSet = primaryOrGroupPrimaryExecutedGtidSet + instance.primaryExecutedGtidSet = primaryExecutedGtidSet return nil } @@ -579,8 +488,8 @@ func 
ReadInstanceClusterAttributes(instance *Instance) (err error) { func readInstanceRow(m sqlutils.RowMap) *Instance { instance := NewInstance() - instance.Key.Hostname = m.GetString("hostname") - instance.Key.Port = m.GetInt("port") + instance.Hostname = m.GetString("hostname") + instance.Port = m.GetInt("port") instance.ServerID = m.GetUint("server_id") instance.ServerUUID = m.GetString("server_uuid") instance.Version = m.GetString("version") @@ -590,9 +499,8 @@ func readInstanceRow(m sqlutils.RowMap) *Instance { instance.BinlogRowImage = m.GetString("binlog_row_image") instance.LogBinEnabled = m.GetBool("log_bin") instance.LogReplicationUpdatesEnabled = m.GetBool("log_replica_updates") - instance.SourceKey.Hostname = m.GetString("source_host") - instance.SourceKey.Port = m.GetInt("source_port") - instance.IsDetachedPrimary = instance.SourceKey.IsDetached() + instance.SourceHost = m.GetString("source_host") + instance.SourcePort = m.GetInt("source_port") instance.ReplicationSQLThreadRuning = m.GetBool("replica_sql_running") instance.ReplicationIOThreadRuning = m.GetBool("replica_io_running") instance.ReplicationSQLThreadState = ReplicationThreadState(m.GetInt("replication_sql_thread_state")) @@ -641,30 +549,12 @@ func readInstanceRow(m sqlutils.RowMap) *Instance { instance.LastSeenTimestamp = m.GetString("last_seen") instance.IsLastCheckValid = m.GetBool("is_last_check_valid") instance.SecondsSinceLastSeen = m.GetNullInt64("seconds_since_last_seen") - instance.IsCandidate = m.GetBool("is_candidate") - instance.PromotionRule = promotionrule.CandidatePromotionRule(m.GetString("promotion_rule")) - instance.IsDowntimed = m.GetBool("is_downtimed") - instance.DowntimeReason = m.GetString("downtime_reason") - instance.DowntimeOwner = m.GetString("downtime_owner") - instance.DowntimeEndTimestamp = m.GetString("downtime_end_timestamp") - instance.ElapsedDowntime = time.Second * time.Duration(m.GetInt("elapsed_downtime_seconds")) - instance.UnresolvedHostname = 
m.GetString("unresolved_hostname") instance.AllowTLS = m.GetBool("allow_tls") - instance.InstanceAlias = m.GetString("instance_alias") + instance.InstanceAlias = m.GetString("alias") instance.LastDiscoveryLatency = time.Duration(m.GetInt64("last_discovery_latency")) * time.Nanosecond instance.applyFlavorName() - /* Read Group Replication variables below */ - instance.ReplicationGroupName = m.GetString("replication_group_name") - instance.ReplicationGroupIsSinglePrimary = m.GetBool("replication_group_is_single_primary_mode") - instance.ReplicationGroupMemberState = m.GetString("replication_group_member_state") - instance.ReplicationGroupMemberRole = m.GetString("replication_group_member_role") - instance.ReplicationGroupPrimaryInstanceKey = InstanceKey{Hostname: m.GetString("replication_group_primary_host"), - Port: m.GetInt("replication_group_primary_port")} - _ = instance.ReplicationGroupMembers.ReadJSON(m.GetString("replication_group_members")) - //instance.ReplicationGroup = m.GetString("replication_group_") - // problems if !instance.IsLastCheckValid { instance.Problems = append(instance.Problems, "last_check_invalid") @@ -678,43 +568,27 @@ func readInstanceRow(m sqlutils.RowMap) *Instance { if instance.GtidErrant != "" { instance.Problems = append(instance.Problems, "errant_gtid") } - // Group replication problems - if instance.ReplicationGroupName != "" && instance.ReplicationGroupMemberState != GroupReplicationMemberStateOnline { - instance.Problems = append(instance.Problems, "group_replication_member_not_online") - } return instance } // readInstancesByCondition is a generic function to read instances from the backend database func readInstancesByCondition(condition string, args []any, sort string) ([](*Instance), error) { - readFunc := func() ([](*Instance), error) { - instances := [](*Instance){} + readFunc := func() ([]*Instance, error) { + var instances []*Instance if sort == "" { - sort = `hostname, port` + sort = `alias` } query := fmt.Sprintf(` 
select *, unix_timestamp() - unix_timestamp(last_checked) as seconds_since_last_checked, ifnull(last_checked <= last_seen, 0) as is_last_check_valid, - unix_timestamp() - unix_timestamp(last_seen) as seconds_since_last_seen, - candidate_database_instance.last_suggested is not null - and candidate_database_instance.promotion_rule in ('must', 'prefer') as is_candidate, - ifnull(nullif(candidate_database_instance.promotion_rule, ''), 'neutral') as promotion_rule, - ifnull(unresolved_hostname, '') as unresolved_hostname, - (database_instance_downtime.downtime_active is not null and ifnull(database_instance_downtime.end_timestamp, now()) > now()) as is_downtimed, - ifnull(database_instance_downtime.reason, '') as downtime_reason, - ifnull(database_instance_downtime.owner, '') as downtime_owner, - ifnull(unix_timestamp() - unix_timestamp(begin_timestamp), 0) as elapsed_downtime_seconds, - ifnull(database_instance_downtime.end_timestamp, '') as downtime_end_timestamp + unix_timestamp() - unix_timestamp(last_seen) as seconds_since_last_seen from - database_instance - left join vitess_tablet using (hostname, port) - left join candidate_database_instance using (hostname, port) - left join hostname_unresolve using (hostname) - left join database_instance_downtime using (hostname, port) + vitess_tablet + left join database_instance using (alias, hostname, port) where %s order by @@ -738,19 +612,14 @@ func readInstancesByCondition(condition string, args []any, sort string) ([](*In return instances, err } -func readInstancesByExactKey(instanceKey *InstanceKey) ([](*Instance), error) { +// ReadInstance reads an instance from the vtorc backend database +func ReadInstance(tabletAlias string) (*Instance, bool, error) { condition := ` - hostname = ? - and port = ? + alias = ? 
` - return readInstancesByCondition(condition, sqlutils.Args(instanceKey.Hostname, instanceKey.Port), "") -} - -// ReadInstance reads an instance from the vtorc backend database -func ReadInstance(instanceKey *InstanceKey) (*Instance, bool, error) { - instances, err := readInstancesByExactKey(instanceKey) - // We know there will be at most one (hostname & port are PK) - // And we expect to find one + instances, err := readInstancesByCondition(condition, sqlutils.Args(tabletAlias), "") + // We know there will be at most one (alias is the PK). + // And we expect to find one. readInstanceCounter.Inc(1) if len(instances) == 0 { return nil, false, err @@ -762,25 +631,25 @@ func ReadInstance(instanceKey *InstanceKey) (*Instance, bool, error) { } // ReadReplicaInstances reads replicas of a given primary -func ReadReplicaInstances(primaryKey *InstanceKey) ([](*Instance), error) { +func ReadReplicaInstances(primaryHost string, primaryPort int) ([](*Instance), error) { condition := ` source_host = ? and source_port = ? 
` - return readInstancesByCondition(condition, sqlutils.Args(primaryKey.Hostname, primaryKey.Port), "") + return readInstancesByCondition(condition, sqlutils.Args(primaryHost, primaryPort), "") } // ReadReplicaInstancesIncludingBinlogServerSubReplicas returns a list of direct slves including any replicas // of a binlog server replica -func ReadReplicaInstancesIncludingBinlogServerSubReplicas(primaryKey *InstanceKey) ([](*Instance), error) { - replicas, err := ReadReplicaInstances(primaryKey) +func ReadReplicaInstancesIncludingBinlogServerSubReplicas(primaryHost string, primaryPort int) ([](*Instance), error) { + replicas, err := ReadReplicaInstances(primaryHost, primaryPort) if err != nil { return replicas, err } for _, replica := range replicas { replica := replica if replica.IsBinlogServer() { - binlogServerReplicas, err := ReadReplicaInstancesIncludingBinlogServerSubReplicas(&replica.Key) + binlogServerReplicas, err := ReadReplicaInstancesIncludingBinlogServerSubReplicas(replica.Hostname, replica.Port) if err != nil { return replicas, err } @@ -803,140 +672,15 @@ func ReadProblemInstances(keyspace string, shard string) ([](*Instance), error) or (abs(cast(replication_lag_seconds as signed) - cast(sql_delay as signed)) > ?) or (abs(cast(replica_lag_seconds as signed) - cast(sql_delay as signed)) > ?) 
or (gtid_errant != '') - or (replication_group_name != '' and replication_group_member_state != 'ONLINE') ) ` args := sqlutils.Args(keyspace, keyspace, shard, shard, config.Config.InstancePollSeconds*5, config.Config.ReasonableReplicationLagSeconds, config.Config.ReasonableReplicationLagSeconds) - instances, err := readInstancesByCondition(condition, args, "") - if err != nil { - return instances, err - } - var reportedInstances [](*Instance) - for _, instance := range instances { - skip := false - if instance.IsDowntimed { - skip = true - } - if !skip { - reportedInstances = append(reportedInstances, instance) - } - } - return reportedInstances, nil -} - -// ReadLostInRecoveryInstances returns all instances (potentially filtered by cluster) -// which are currently indicated as downtimed due to being lost during a topology recovery. -func ReadLostInRecoveryInstances(keyspace string, shard string) ([](*Instance), error) { - condition := ` - ifnull( - database_instance_downtime.downtime_active = 1 - and database_instance_downtime.end_timestamp > now() - and database_instance_downtime.reason = ?, 0) - and ? IN ('', keyspace) - and ? IN ('', shard) - ` - return readInstancesByCondition(condition, sqlutils.Args(DowntimeLostInRecoveryMessage, keyspace, shard), "keyspace asc, shard asc, replication_depth asc") -} - -// ForgetUnseenInstancesDifferentlyResolved will purge instances which are invalid, and whose hostname -// appears on the hostname_resolved table; this means some time in the past their hostname was unresovled, and now -// resovled to a different value; the old hostname is never accessed anymore and the old entry should be removed. 
-func ForgetUnseenInstancesDifferentlyResolved() error { - query := ` - select - database_instance.hostname, database_instance.port - from - hostname_resolve - JOIN database_instance ON (hostname_resolve.hostname = database_instance.hostname) - where - hostname_resolve.hostname != hostname_resolve.resolved_hostname - AND ifnull(last_checked <= last_seen, 0) = 0 - ` - keys := NewInstanceKeyMap() - err := db.QueryVTOrc(query, nil, func(m sqlutils.RowMap) error { - key := InstanceKey{ - Hostname: m.GetString("hostname"), - Port: m.GetInt("port"), - } - keys.AddKey(key) - return nil - }) - var rowsAffected int64 - for _, key := range keys.GetInstanceKeys() { - sqlResult, err := db.ExecVTOrc(` - delete from - database_instance - where - hostname = ? and port = ? - `, key.Hostname, key.Port, - ) - if err != nil { - log.Error(err) - return err - } - rows, err := sqlResult.RowsAffected() - if err != nil { - log.Error(err) - return err - } - rowsAffected = rowsAffected + rows - } - _ = AuditOperation("forget-unseen-differently-resolved", nil, fmt.Sprintf("Forgotten instances: %d", rowsAffected)) - return err -} - -// readUnknownPrimaryHostnameResolves will figure out the resolved hostnames of primary-hosts which cannot be found. 
-// It uses the hostname_resolve_history table to heuristically guess the correct hostname (based on "this was the -// last time we saw this hostname and it resolves into THAT") -func readUnknownPrimaryHostnameResolves() (map[string]string, error) { - res := make(map[string]string) - err := db.QueryVTOrcRowsMap(` - SELECT DISTINCT - replica_instance.source_host, hostname_resolve_history.resolved_hostname - FROM - database_instance replica_instance - LEFT JOIN hostname_resolve ON (replica_instance.source_host = hostname_resolve.hostname) - LEFT JOIN database_instance primary_instance ON ( - COALESCE(hostname_resolve.resolved_hostname, replica_instance.source_host) = primary_instance.hostname - and replica_instance.source_port = primary_instance.port - ) LEFT JOIN hostname_resolve_history ON (replica_instance.source_host = hostname_resolve_history.hostname) - WHERE - primary_instance.last_checked IS NULL - and replica_instance.source_host != '' - and replica_instance.source_host != '_' - and replica_instance.source_port > 0 - `, func(m sqlutils.RowMap) error { - res[m.GetString("source_host")] = m.GetString("resolved_hostname") - return nil - }) - if err != nil { - log.Error(err) - return res, err - } - - return res, nil -} - -// ResolveUnknownPrimaryHostnameResolves fixes missing hostname resolves based on hostname_resolve_history -// The use case is replicas replicating from some unknown-hostname which cannot be otherwise found. This could -// happen due to an expire unresolve together with clearing up of hostname cache. 
-func ResolveUnknownPrimaryHostnameResolves() error { - - hostnameResolves, err := readUnknownPrimaryHostnameResolves() - if err != nil { - return err - } - for hostname, resolvedHostname := range hostnameResolves { - UpdateResolvedHostname(hostname, resolvedHostname) - } - - _ = AuditOperation("resolve-unknown-primaries", nil, fmt.Sprintf("Num resolved hostnames: %d", len(hostnameResolves))) - return err + return readInstancesByCondition(condition, args, "") } // GetKeyspaceShardName gets the keyspace shard name for the given instance key -func GetKeyspaceShardName(instanceKey *InstanceKey) (keyspace string, shard string, err error) { +func GetKeyspaceShardName(tabletAlias string) (keyspace string, shard string, err error) { query := ` select keyspace, @@ -944,10 +688,9 @@ func GetKeyspaceShardName(instanceKey *InstanceKey) (keyspace string, shard stri from vitess_tablet where - hostname = ? - and port = ? + alias = ? ` - err = db.QueryVTOrc(query, sqlutils.Args(instanceKey.Hostname, instanceKey.Port), func(m sqlutils.RowMap) error { + err = db.QueryVTOrc(query, sqlutils.Args(tabletAlias), func(m sqlutils.RowMap) error { keyspace = m.GetString("keyspace") shard = m.GetString("shard") return nil @@ -967,11 +710,11 @@ func GetKeyspaceShardName(instanceKey *InstanceKey) (keyspace string, shard stri // resulted in an actual check! This can happen when TCP/IP connections are hung, in which case the "check" // never returns. In such case we multiply interval by a factor, so as not to open too many connections on // the instance. 
-func ReadOutdatedInstanceKeys() ([]InstanceKey, error) { - res := []InstanceKey{} +func ReadOutdatedInstanceKeys() ([]string, error) { + var res []string query := ` SELECT - hostname, port + alias FROM database_instance WHERE @@ -982,24 +725,21 @@ func ReadOutdatedInstanceKeys() ([]InstanceKey, error) { END UNION SELECT - vitess_tablet.hostname, vitess_tablet.port + vitess_tablet.alias FROM vitess_tablet LEFT JOIN database_instance ON ( - vitess_tablet.hostname = database_instance.hostname - AND vitess_tablet.port = database_instance.port + vitess_tablet.alias = database_instance.alias ) WHERE - database_instance.hostname IS NULL + database_instance.alias IS NULL ` args := sqlutils.Args(config.Config.InstancePollSeconds, 2*config.Config.InstancePollSeconds) err := db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error { - instanceKey, merr := NewResolveInstanceKey(m.GetString("hostname"), m.GetInt("port")) - if merr != nil { - log.Error(merr) - } else if !InstanceIsForgotten(instanceKey) { + tabletAlias := m.GetString("alias") + if !InstanceIsForgotten(tabletAlias) { // only if not in "forget" cache - res = append(res, *instanceKey) + res = append(res, tabletAlias) } // We don;t return an error because we want to keep filling the outdated instances list. 
return nil @@ -1066,6 +806,7 @@ func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bo insertIgnore = true } var columns = []string{ + "alias", "hostname", "port", "last_checked", @@ -1127,24 +868,16 @@ func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bo "semi_sync_primary_status", "semi_sync_primary_clients", "semi_sync_replica_status", - "instance_alias", "last_discovery_latency", - "replication_group_name", - "replication_group_is_single_primary_mode", - "replication_group_member_state", - "replication_group_member_role", - "replication_group_members", - "replication_group_primary_host", - "replication_group_primary_port", } var values = make([]string, len(columns)) for i := range columns { values[i] = "?" } - values[2] = "NOW()" // last_checked - values[3] = "NOW()" // last_attempted_check - values[4] = "1" // last_check_partial_success + values[3] = "NOW()" // last_checked + values[4] = "NOW()" // last_attempted_check + values[5] = "1" // last_check_partial_success if updateLastSeen { columns = append(columns, "last_seen") @@ -1155,8 +888,9 @@ func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bo for _, instance := range instances { // number of columns minus 2 as last_checked and last_attempted_check // updated with NOW() - args = append(args, instance.Key.Hostname) - args = append(args, instance.Key.Port) + args = append(args, instance.InstanceAlias) + args = append(args, instance.Hostname) + args = append(args, instance.Port) args = append(args, instance.ServerID) args = append(args, instance.ServerUUID) args = append(args, instance.Version) @@ -1170,8 +904,8 @@ func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bo args = append(args, instance.LogReplicationUpdatesEnabled) args = append(args, instance.SelfBinlogCoordinates.LogFile) args = append(args, instance.SelfBinlogCoordinates.LogPos) - args = append(args, instance.SourceKey.Hostname) - args = 
append(args, instance.SourceKey.Port) + args = append(args, instance.SourceHost) + args = append(args, instance.SourcePort) args = append(args, instance.ReplicationSQLThreadRuning) args = append(args, instance.ReplicationIOThreadRuning) args = append(args, instance.ReplicationSQLThreadState) @@ -1213,15 +947,7 @@ func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bo args = append(args, instance.SemiSyncPrimaryStatus) args = append(args, instance.SemiSyncPrimaryClients) args = append(args, instance.SemiSyncReplicaStatus) - args = append(args, instance.InstanceAlias) args = append(args, instance.LastDiscoveryLatency.Nanoseconds()) - args = append(args, instance.ReplicationGroupName) - args = append(args, instance.ReplicationGroupIsSinglePrimary) - args = append(args, instance.ReplicationGroupMemberState) - args = append(args, instance.ReplicationGroupMemberRole) - args = append(args, instance.ReplicationGroupMembers.ToJSONString()) - args = append(args, instance.ReplicationGroupPrimaryInstanceKey.Hostname) - args = append(args, instance.ReplicationGroupPrimaryInstanceKey.Port) } sql, err := mkInsertOdku("database_instance", columns, values, len(instances), insertIgnore) @@ -1238,7 +964,7 @@ func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bo func writeManyInstances(instances []*Instance, instanceWasActuallyFound bool, updateLastSeen bool) error { writeInstances := [](*Instance){} for _, instance := range instances { - if InstanceIsForgotten(&instance.Key) && !instance.IsSeed() { + if InstanceIsForgotten(instance.InstanceAlias) && !instance.IsSeed() { continue } writeInstances = append(writeInstances, instance) @@ -1267,7 +993,7 @@ func WriteInstance(instance *Instance, instanceWasActuallyFound bool, lastError // UpdateInstanceLastChecked updates the last_check timestamp in the vtorc backed database // for a given instance -func UpdateInstanceLastChecked(instanceKey *InstanceKey, partialSuccess bool) error { 
+func UpdateInstanceLastChecked(tabletAlias string, partialSuccess bool) error { writeFunc := func() error { _, err := db.ExecVTOrc(` update @@ -1276,11 +1002,9 @@ func UpdateInstanceLastChecked(instanceKey *InstanceKey, partialSuccess bool) er last_checked = NOW(), last_check_partial_success = ? where - hostname = ? - and port = ?`, + alias = ?`, partialSuccess, - instanceKey.Hostname, - instanceKey.Port, + tabletAlias, ) if err != nil { log.Error(err) @@ -1298,7 +1022,7 @@ func UpdateInstanceLastChecked(instanceKey *InstanceKey, partialSuccess bool) er // And so we make sure to note down *before* we even attempt to access the instance; and this raises a red flag when we // wish to access the instance again: if last_attempted_check is *newer* than last_checked, that's bad news and means // we have a "hanging" issue. -func UpdateInstanceLastAttemptedCheck(instanceKey *InstanceKey) error { +func UpdateInstanceLastAttemptedCheck(tabletAlias string) error { writeFunc := func() error { _, err := db.ExecVTOrc(` update @@ -1306,10 +1030,8 @@ func UpdateInstanceLastAttemptedCheck(instanceKey *InstanceKey) error { set last_attempted_check = NOW() where - hostname = ? - and port = ?`, - instanceKey.Hostname, - instanceKey.Port, + alias = ?`, + tabletAlias, ) if err != nil { log.Error(err) @@ -1319,43 +1041,59 @@ func UpdateInstanceLastAttemptedCheck(instanceKey *InstanceKey) error { return ExecDBWriteFunc(writeFunc) } -func InstanceIsForgotten(instanceKey *InstanceKey) bool { - _, found := forgetInstanceKeys.Get(instanceKey.StringCode()) +func InstanceIsForgotten(tabletAlias string) bool { + _, found := forgetAliases.Get(tabletAlias) return found } // ForgetInstance removes an instance entry from the vtorc backed database. // It may be auto-rediscovered through topology or requested for discovery by multiple means. 
-func ForgetInstance(instanceKey *InstanceKey) error { - if instanceKey == nil { - errMsg := "ForgetInstance(): nil instanceKey" +func ForgetInstance(tabletAlias string) error { + if tabletAlias == "" { + errMsg := "ForgetInstance(): empty tabletAlias" log.Errorf(errMsg) return fmt.Errorf(errMsg) } - forgetInstanceKeys.Set(instanceKey.StringCode(), true, cache.DefaultExpiration) + forgetAliases.Set(tabletAlias, true, cache.DefaultExpiration) + log.Infof("Forgetting: %v", tabletAlias) + + // Delete from the 'vitess_tablet' table. + _, err := db.ExecVTOrc(` + delete + from vitess_tablet + where + alias = ?`, + tabletAlias, + ) + if err != nil { + log.Error(err) + return err + } + + // Also delete from the 'database_instance' table. sqlResult, err := db.ExecVTOrc(` delete from database_instance where - hostname = ? and port = ?`, - instanceKey.Hostname, - instanceKey.Port, + alias = ?`, + tabletAlias, ) if err != nil { log.Error(err) return err } + // Get the number of rows affected. If they are zero, then we tried to forget an instance that doesn't exist. 
rows, err := sqlResult.RowsAffected() if err != nil { log.Error(err) return err } if rows == 0 { - errMsg := fmt.Sprintf("ForgetInstance(): instance %+v not found", *instanceKey) + errMsg := fmt.Sprintf("ForgetInstance(): instance %+v not found", tabletAlias) log.Errorf(errMsg) return fmt.Errorf(errMsg) } - _ = AuditOperation("forget", instanceKey, "") + _ = AuditOperation("forget", tabletAlias, "") return nil } @@ -1377,7 +1115,7 @@ func ForgetLongUnseenInstances() error { log.Error(err) return err } - _ = AuditOperation("forget-unseen", nil, fmt.Sprintf("Forgotten instances: %d", rows)) + _ = AuditOperation("forget-unseen", "", fmt.Sprintf("Forgotten instances: %d", rows)) return err } @@ -1387,12 +1125,14 @@ func SnapshotTopologies() error { _, err := db.ExecVTOrc(` insert ignore into database_instance_topology_history (snapshot_unix_timestamp, - hostname, port, source_host, source_port, version) + alias, hostname, port, source_host, source_port, keyspace, shard, version) select UNIX_TIMESTAMP(NOW()), - hostname, port, source_host, source_port, version + vitess_tablet.alias, vitess_tablet.hostname, vitess_tablet.port, + database_instance.source_host, database_instance.source_port, + vitess_tablet.keyspace, vitess_tablet.shard, database_instance.version from - database_instance + vitess_tablet left join database_instance using (alias, hostname, port) `, ) if err != nil { @@ -1406,16 +1146,16 @@ func SnapshotTopologies() error { } // RecordStaleInstanceBinlogCoordinates snapshots the binlog coordinates of instances -func RecordStaleInstanceBinlogCoordinates(instanceKey *InstanceKey, binlogCoordinates *BinlogCoordinates) error { +func RecordStaleInstanceBinlogCoordinates(tabletAlias string, binlogCoordinates *BinlogCoordinates) error { args := sqlutils.Args( - instanceKey.Hostname, instanceKey.Port, + tabletAlias, binlogCoordinates.LogFile, binlogCoordinates.LogPos, ) _, err := db.ExecVTOrc(` delete from database_instance_stale_binlog_coordinates where - hostname=? 
and port=? + alias = ? and ( binary_log_file != ? or binary_log_pos != ? @@ -1430,10 +1170,10 @@ func RecordStaleInstanceBinlogCoordinates(instanceKey *InstanceKey, binlogCoordi _, err = db.ExecVTOrc(` insert ignore into database_instance_stale_binlog_coordinates ( - hostname, port, binary_log_file, binary_log_pos, first_seen + alias, binary_log_file, binary_log_pos, first_seen ) values ( - ?, ?, ?, ?, NOW() + ?, ?, ?, NOW() )`, args...) if err != nil { diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go index 71d0ed94ff9..66393c50448 100644 --- a/go/vt/vtorc/inst/instance_dao_test.go +++ b/go/vt/vtorc/inst/instance_dao_test.go @@ -6,19 +6,18 @@ import ( "regexp" "strings" "testing" + "time" + "github.com/patrickmn/go-cache" "github.com/stretchr/testify/require" + "vitess.io/vitess/go/vt/external/golib/sqlutils" topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/topo/topoproto" + "vitess.io/vitess/go/vt/vtorc/config" "vitess.io/vitess/go/vt/vtorc/db" ) -var ( - i710k = InstanceKey{Hostname: "i710", Port: 3306} - i720k = InstanceKey{Hostname: "i720", Port: 3306} - i730k = InstanceKey{Hostname: "i730", Port: 3306} -) - var ( spacesRegexp = regexp.MustCompile(`[ \t\n\r]+`) ) @@ -36,9 +35,9 @@ func stripSpaces(s string) string { } func mkTestInstances() []*Instance { - i710 := Instance{Key: i710k, ServerID: 710, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000007", LogPos: 10}} - i720 := Instance{Key: i720k, ServerID: 720, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000007", LogPos: 20}} - i730 := Instance{Key: i730k, ServerID: 730, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000007", LogPos: 30}} + i710 := Instance{InstanceAlias: "zone1-i710", Hostname: "i710", Port: 3306, ServerID: 710, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000007", LogPos: 10}} + i720 := Instance{InstanceAlias: "zone1-i720", Hostname: "i720", Port: 3306, ServerID: 720, 
ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000007", LogPos: 20}} + i730 := Instance{InstanceAlias: "zone1-i730", Hostname: "i730", Port: 3306, ServerID: 730, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000007", LogPos: 30}} instances := []*Instance{&i710, &i720, &i730} for _, instance := range instances { instance.Version = "5.6.7" @@ -59,21 +58,21 @@ func TestMkInsertOdkuSingle(t *testing.T) { // one instance s1 := `INSERT ignore INTO database_instance - (hostname, port, last_checked, last_attempted_check, last_check_partial_success, server_id, server_uuid, - version, major_version, version_comment, binlog_server, read_only, binlog_format, - binlog_row_image, log_bin, log_replica_updates, binary_log_file, binary_log_pos, source_host, source_port, - replica_sql_running, replica_io_running, replication_sql_thread_state, replication_io_thread_state, has_replication_filters, supports_oracle_gtid, oracle_gtid, source_uuid, ancestry_uuid, executed_gtid_set, gtid_mode, gtid_purged, gtid_errant, mariadb_gtid, pseudo_gtid, - source_log_file, read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, semi_sync_replica_enabled, semi_sync_primary_status, semi_sync_primary_clients, semi_sync_replica_status, instance_alias, last_discovery_latency, replication_group_name, replication_group_is_single_primary_mode, replication_group_member_state, replication_group_member_role, replication_group_members, replication_group_primary_host, replication_group_primary_port, last_seen) - VALUES - (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()) - ON DUPLICATE KEY UPDATE - hostname=VALUES(hostname), port=VALUES(port), last_checked=VALUES(last_checked), last_attempted_check=VALUES(last_attempted_check), last_check_partial_success=VALUES(last_check_partial_success), server_id=VALUES(server_id), server_uuid=VALUES(server_uuid), version=VALUES(version), major_version=VALUES(major_version), version_comment=VALUES(version_comment), binlog_server=VALUES(binlog_server), read_only=VALUES(read_only), binlog_format=VALUES(binlog_format), binlog_row_image=VALUES(binlog_row_image), log_bin=VALUES(log_bin), log_replica_updates=VALUES(log_replica_updates), binary_log_file=VALUES(binary_log_file), binary_log_pos=VALUES(binary_log_pos), source_host=VALUES(source_host), source_port=VALUES(source_port), replica_sql_running=VALUES(replica_sql_running), replica_io_running=VALUES(replica_io_running), replication_sql_thread_state=VALUES(replication_sql_thread_state), replication_io_thread_state=VALUES(replication_io_thread_state), has_replication_filters=VALUES(has_replication_filters), supports_oracle_gtid=VALUES(supports_oracle_gtid), oracle_gtid=VALUES(oracle_gtid), source_uuid=VALUES(source_uuid), ancestry_uuid=VALUES(ancestry_uuid), executed_gtid_set=VALUES(executed_gtid_set), gtid_mode=VALUES(gtid_mode), gtid_purged=VALUES(gtid_purged), gtid_errant=VALUES(gtid_errant), mariadb_gtid=VALUES(mariadb_gtid), pseudo_gtid=VALUES(pseudo_gtid), source_log_file=VALUES(source_log_file), read_source_log_pos=VALUES(read_source_log_pos), relay_source_log_file=VALUES(relay_source_log_file), exec_source_log_pos=VALUES(exec_source_log_pos), relay_log_file=VALUES(relay_log_file), relay_log_pos=VALUES(relay_log_pos), last_sql_error=VALUES(last_sql_error), last_io_error=VALUES(last_io_error), replication_lag_seconds=VALUES(replication_lag_seconds), replica_lag_seconds=VALUES(replica_lag_seconds), sql_delay=VALUES(sql_delay), 
data_center=VALUES(data_center), region=VALUES(region), physical_environment=VALUES(physical_environment), replication_depth=VALUES(replication_depth), is_co_primary=VALUES(is_co_primary), has_replication_credentials=VALUES(has_replication_credentials), allow_tls=VALUES(allow_tls), - semi_sync_enforced=VALUES(semi_sync_enforced), semi_sync_primary_enabled=VALUES(semi_sync_primary_enabled), semi_sync_primary_timeout=VALUES(semi_sync_primary_timeout), semi_sync_primary_wait_for_replica_count=VALUES(semi_sync_primary_wait_for_replica_count), semi_sync_replica_enabled=VALUES(semi_sync_replica_enabled), semi_sync_primary_status=VALUES(semi_sync_primary_status), semi_sync_primary_clients=VALUES(semi_sync_primary_clients), semi_sync_replica_status=VALUES(semi_sync_replica_status), - instance_alias=VALUES(instance_alias), last_discovery_latency=VALUES(last_discovery_latency), replication_group_name=VALUES(replication_group_name), replication_group_is_single_primary_mode=VALUES(replication_group_is_single_primary_mode), replication_group_member_state=VALUES(replication_group_member_state), replication_group_member_role=VALUES(replication_group_member_role), replication_group_members=VALUES(replication_group_members), replication_group_primary_host=VALUES(replication_group_primary_host), replication_group_primary_port=VALUES(replication_group_primary_port), last_seen=VALUES(last_seen) - ` - a1 := `i710, 3306, 710, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, + (alias, hostname, port, last_checked, last_attempted_check, last_check_partial_success, server_id, server_uuid, + version, major_version, version_comment, binlog_server, read_only, binlog_format, + binlog_row_image, log_bin, log_replica_updates, binary_log_file, binary_log_pos, source_host, source_port, + replica_sql_running, replica_io_running, replication_sql_thread_state, replication_io_thread_state, has_replication_filters, supports_oracle_gtid, oracle_gtid, source_uuid, ancestry_uuid, executed_gtid_set, 
gtid_mode, gtid_purged, gtid_errant, mariadb_gtid, pseudo_gtid, + source_log_file, read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, semi_sync_replica_enabled, semi_sync_primary_status, semi_sync_primary_clients, semi_sync_replica_status, last_discovery_latency, last_seen) + VALUES + (?, ?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()) + ON DUPLICATE KEY UPDATE + alias=VALUES(alias), hostname=VALUES(hostname), port=VALUES(port), last_checked=VALUES(last_checked), last_attempted_check=VALUES(last_attempted_check), last_check_partial_success=VALUES(last_check_partial_success), server_id=VALUES(server_id), server_uuid=VALUES(server_uuid), version=VALUES(version), major_version=VALUES(major_version), version_comment=VALUES(version_comment), binlog_server=VALUES(binlog_server), read_only=VALUES(read_only), binlog_format=VALUES(binlog_format), binlog_row_image=VALUES(binlog_row_image), log_bin=VALUES(log_bin), log_replica_updates=VALUES(log_replica_updates), binary_log_file=VALUES(binary_log_file), binary_log_pos=VALUES(binary_log_pos), source_host=VALUES(source_host), source_port=VALUES(source_port), replica_sql_running=VALUES(replica_sql_running), replica_io_running=VALUES(replica_io_running), replication_sql_thread_state=VALUES(replication_sql_thread_state), replication_io_thread_state=VALUES(replication_io_thread_state), has_replication_filters=VALUES(has_replication_filters), supports_oracle_gtid=VALUES(supports_oracle_gtid), 
oracle_gtid=VALUES(oracle_gtid), source_uuid=VALUES(source_uuid), ancestry_uuid=VALUES(ancestry_uuid), executed_gtid_set=VALUES(executed_gtid_set), gtid_mode=VALUES(gtid_mode), gtid_purged=VALUES(gtid_purged), gtid_errant=VALUES(gtid_errant), mariadb_gtid=VALUES(mariadb_gtid), pseudo_gtid=VALUES(pseudo_gtid), source_log_file=VALUES(source_log_file), read_source_log_pos=VALUES(read_source_log_pos), relay_source_log_file=VALUES(relay_source_log_file), exec_source_log_pos=VALUES(exec_source_log_pos), relay_log_file=VALUES(relay_log_file), relay_log_pos=VALUES(relay_log_pos), last_sql_error=VALUES(last_sql_error), last_io_error=VALUES(last_io_error), replication_lag_seconds=VALUES(replication_lag_seconds), replica_lag_seconds=VALUES(replica_lag_seconds), sql_delay=VALUES(sql_delay), data_center=VALUES(data_center), region=VALUES(region), physical_environment=VALUES(physical_environment), replication_depth=VALUES(replication_depth), is_co_primary=VALUES(is_co_primary), has_replication_credentials=VALUES(has_replication_credentials), allow_tls=VALUES(allow_tls), + semi_sync_enforced=VALUES(semi_sync_enforced), semi_sync_primary_enabled=VALUES(semi_sync_primary_enabled), semi_sync_primary_timeout=VALUES(semi_sync_primary_timeout), semi_sync_primary_wait_for_replica_count=VALUES(semi_sync_primary_wait_for_replica_count), semi_sync_replica_enabled=VALUES(semi_sync_replica_enabled), semi_sync_primary_status=VALUES(semi_sync_primary_status), semi_sync_primary_clients=VALUES(semi_sync_primary_clients), semi_sync_replica_status=VALUES(semi_sync_replica_status), + last_discovery_latency=VALUES(last_discovery_latency), last_seen=VALUES(last_seen) + ` + a1 := `zone1-i710, i710, 3306, 710, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, - false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 10, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], 
, 0, ` + false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 10, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, 0,` sql1, args1, err := mkInsertOdkuForInstances(instances[:1], false, true) require.NoError(t, err) @@ -86,22 +85,25 @@ func TestMkInsertOdkuThree(t *testing.T) { // three instances s3 := `INSERT INTO database_instance - (hostname, port, last_checked, last_attempted_check, last_check_partial_success, server_id, server_uuid, version, major_version, version_comment, binlog_server, read_only, binlog_format, binlog_row_image, log_bin, log_replica_updates, binary_log_file, binary_log_pos, source_host, source_port, replica_sql_running, replica_io_running, replication_sql_thread_state, replication_io_thread_state, has_replication_filters, supports_oracle_gtid, oracle_gtid, source_uuid, ancestry_uuid, executed_gtid_set, gtid_mode, gtid_purged, gtid_errant, mariadb_gtid, pseudo_gtid, source_log_file, read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, - semi_sync_replica_enabled, semi_sync_primary_status, semi_sync_primary_clients, semi_sync_replica_status, instance_alias, last_discovery_latency, replication_group_name, replication_group_is_single_primary_mode, replication_group_member_state, replication_group_member_role, replication_group_members, replication_group_primary_host, replication_group_primary_port, last_seen) - VALUES - (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()), - (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()), - (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()) - ON DUPLICATE KEY UPDATE - hostname=VALUES(hostname), port=VALUES(port), last_checked=VALUES(last_checked), last_attempted_check=VALUES(last_attempted_check), last_check_partial_success=VALUES(last_check_partial_success), server_id=VALUES(server_id), server_uuid=VALUES(server_uuid), version=VALUES(version), major_version=VALUES(major_version), version_comment=VALUES(version_comment), binlog_server=VALUES(binlog_server), read_only=VALUES(read_only), binlog_format=VALUES(binlog_format), binlog_row_image=VALUES(binlog_row_image), log_bin=VALUES(log_bin), log_replica_updates=VALUES(log_replica_updates), binary_log_file=VALUES(binary_log_file), binary_log_pos=VALUES(binary_log_pos), source_host=VALUES(source_host), source_port=VALUES(source_port), replica_sql_running=VALUES(replica_sql_running), replica_io_running=VALUES(replica_io_running), replication_sql_thread_state=VALUES(replication_sql_thread_state), replication_io_thread_state=VALUES(replication_io_thread_state), has_replication_filters=VALUES(has_replication_filters), supports_oracle_gtid=VALUES(supports_oracle_gtid), oracle_gtid=VALUES(oracle_gtid), source_uuid=VALUES(source_uuid), ancestry_uuid=VALUES(ancestry_uuid), executed_gtid_set=VALUES(executed_gtid_set), gtid_mode=VALUES(gtid_mode), gtid_purged=VALUES(gtid_purged), gtid_errant=VALUES(gtid_errant), mariadb_gtid=VALUES(mariadb_gtid), pseudo_gtid=VALUES(pseudo_gtid), source_log_file=VALUES(source_log_file), read_source_log_pos=VALUES(read_source_log_pos), 
relay_source_log_file=VALUES(relay_source_log_file), exec_source_log_pos=VALUES(exec_source_log_pos), relay_log_file=VALUES(relay_log_file), relay_log_pos=VALUES(relay_log_pos), last_sql_error=VALUES(last_sql_error), last_io_error=VALUES(last_io_error), replication_lag_seconds=VALUES(replication_lag_seconds), replica_lag_seconds=VALUES(replica_lag_seconds), sql_delay=VALUES(sql_delay), data_center=VALUES(data_center), region=VALUES(region), - physical_environment=VALUES(physical_environment), replication_depth=VALUES(replication_depth), is_co_primary=VALUES(is_co_primary), has_replication_credentials=VALUES(has_replication_credentials), allow_tls=VALUES(allow_tls), semi_sync_enforced=VALUES(semi_sync_enforced), - semi_sync_primary_enabled=VALUES(semi_sync_primary_enabled), semi_sync_primary_timeout=VALUES(semi_sync_primary_timeout), semi_sync_primary_wait_for_replica_count=VALUES(semi_sync_primary_wait_for_replica_count), semi_sync_replica_enabled=VALUES(semi_sync_replica_enabled), semi_sync_primary_status=VALUES(semi_sync_primary_status), semi_sync_primary_clients=VALUES(semi_sync_primary_clients), semi_sync_replica_status=VALUES(semi_sync_replica_status), - instance_alias=VALUES(instance_alias), last_discovery_latency=VALUES(last_discovery_latency), replication_group_name=VALUES(replication_group_name), replication_group_is_single_primary_mode=VALUES(replication_group_is_single_primary_mode), replication_group_member_state=VALUES(replication_group_member_state), replication_group_member_role=VALUES(replication_group_member_role), replication_group_members=VALUES(replication_group_members), replication_group_primary_host=VALUES(replication_group_primary_host), replication_group_primary_port=VALUES(replication_group_primary_port), last_seen=VALUES(last_seen) - ` + (alias, hostname, port, last_checked, last_attempted_check, last_check_partial_success, server_id, server_uuid, + version, major_version, version_comment, binlog_server, read_only, binlog_format, + 
binlog_row_image, log_bin, log_replica_updates, binary_log_file, binary_log_pos, source_host, source_port, + replica_sql_running, replica_io_running, replication_sql_thread_state, replication_io_thread_state, has_replication_filters, supports_oracle_gtid, oracle_gtid, source_uuid, ancestry_uuid, executed_gtid_set, gtid_mode, gtid_purged, gtid_errant, mariadb_gtid, pseudo_gtid, + source_log_file, read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, semi_sync_replica_enabled, semi_sync_primary_status, semi_sync_primary_clients, semi_sync_replica_status, last_discovery_latency, last_seen) + VALUES + (?, ?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()), + (?, ?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()), + (?, ?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()) + ON DUPLICATE KEY UPDATE + alias=VALUES(alias), hostname=VALUES(hostname), port=VALUES(port), last_checked=VALUES(last_checked), last_attempted_check=VALUES(last_attempted_check), last_check_partial_success=VALUES(last_check_partial_success), server_id=VALUES(server_id), server_uuid=VALUES(server_uuid), version=VALUES(version), major_version=VALUES(major_version), version_comment=VALUES(version_comment), 
binlog_server=VALUES(binlog_server), read_only=VALUES(read_only), binlog_format=VALUES(binlog_format), binlog_row_image=VALUES(binlog_row_image), log_bin=VALUES(log_bin), log_replica_updates=VALUES(log_replica_updates), binary_log_file=VALUES(binary_log_file), binary_log_pos=VALUES(binary_log_pos), source_host=VALUES(source_host), source_port=VALUES(source_port), replica_sql_running=VALUES(replica_sql_running), replica_io_running=VALUES(replica_io_running), replication_sql_thread_state=VALUES(replication_sql_thread_state), replication_io_thread_state=VALUES(replication_io_thread_state), has_replication_filters=VALUES(has_replication_filters), supports_oracle_gtid=VALUES(supports_oracle_gtid), oracle_gtid=VALUES(oracle_gtid), source_uuid=VALUES(source_uuid), ancestry_uuid=VALUES(ancestry_uuid), executed_gtid_set=VALUES(executed_gtid_set), gtid_mode=VALUES(gtid_mode), gtid_purged=VALUES(gtid_purged), gtid_errant=VALUES(gtid_errant), mariadb_gtid=VALUES(mariadb_gtid), pseudo_gtid=VALUES(pseudo_gtid), source_log_file=VALUES(source_log_file), read_source_log_pos=VALUES(read_source_log_pos), relay_source_log_file=VALUES(relay_source_log_file), exec_source_log_pos=VALUES(exec_source_log_pos), relay_log_file=VALUES(relay_log_file), relay_log_pos=VALUES(relay_log_pos), last_sql_error=VALUES(last_sql_error), last_io_error=VALUES(last_io_error), replication_lag_seconds=VALUES(replication_lag_seconds), replica_lag_seconds=VALUES(replica_lag_seconds), sql_delay=VALUES(sql_delay), data_center=VALUES(data_center), region=VALUES(region), + physical_environment=VALUES(physical_environment), replication_depth=VALUES(replication_depth), is_co_primary=VALUES(is_co_primary), has_replication_credentials=VALUES(has_replication_credentials), allow_tls=VALUES(allow_tls), semi_sync_enforced=VALUES(semi_sync_enforced), + semi_sync_primary_enabled=VALUES(semi_sync_primary_enabled), semi_sync_primary_timeout=VALUES(semi_sync_primary_timeout), 
semi_sync_primary_wait_for_replica_count=VALUES(semi_sync_primary_wait_for_replica_count), semi_sync_replica_enabled=VALUES(semi_sync_replica_enabled), semi_sync_primary_status=VALUES(semi_sync_primary_status), semi_sync_primary_clients=VALUES(semi_sync_primary_clients), semi_sync_replica_status=VALUES(semi_sync_replica_status), + last_discovery_latency=VALUES(last_discovery_latency), last_seen=VALUES(last_seen) + ` a3 := ` - i710, 3306, 710, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 10, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, - i720, 3306, 720, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 20, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, - i730, 3306, 730, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 30, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, + zone1-i710, i710, 3306, 710, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 10, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, 0, + zone1-i720, i720, 3306, 720, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 20, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, 
false, 0, + zone1-i730, i730, 3306, 730, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 30, , 0, , , {0 false}, {0 false}, 0, , , , 0, false, false, false, false, false, 0, 0, false, false, 0, false, 0, ` sql3, args3, err := mkInsertOdkuForInstances(instances[:3], true, true) @@ -147,11 +149,482 @@ func TestGetKeyspaceShardName(t *testing.T) { err = SaveTablet(tab100) require.NoError(t, err) - keyspaceRead, shardRead, err := GetKeyspaceShardName(&InstanceKey{ - Hostname: hostname, - Port: int(port), - }) + keyspaceRead, shardRead, err := GetKeyspaceShardName(topoproto.TabletAliasString(tab100.Alias)) require.NoError(t, err) require.Equal(t, ks, keyspaceRead) require.Equal(t, shard, shardRead) } + +// TestReadInstance is used to test the functionality of ReadInstance and verify its failure modes and successes. +func TestReadInstance(t *testing.T) { + tests := []struct { + name string + tabletAliasToRead string + instanceFound bool + }{ + { + name: "Read success", + tabletAliasToRead: "zone1-0000000100", + instanceFound: true, + }, { + name: "Unknown tablet", + tabletAliasToRead: "unknown-tablet", + instanceFound: false, + }, + } + + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + got, found, err := ReadInstance(tt.tabletAliasToRead) + require.NoError(t, err) + require.Equal(t, tt.instanceFound, found) + if tt.instanceFound { + require.EqualValues(t, tt.tabletAliasToRead, got.InstanceAlias) + } + }) + } +} + +// TestReadReplicaInstances is used to test the functionality of ReadReplicaInstances and verify its failure modes and successes. 
+func TestReadReplicaInstances(t *testing.T) { + tests := []struct { + name string + tabletPort int + replicasLen int + }{ + { + name: "Read success - Multiple replicas", + // This tabletPort corresponds to zone1-0000000101. That is the primary for the data inserted. + // Check initialSQL for more details. + tabletPort: 6714, + replicasLen: 3, + }, { + name: "Unknown tablet", + // This tabletPort corresponds to none of the tablets. + // Check initialSQL for more details. + tabletPort: 343, + replicasLen: 0, + }, { + name: "Read success - No replicas", + // This tabletPort corresponds to zone1-0000000100. That is a replica tablet, with no replicas of its own. + // Check initialSQL for more details. + tabletPort: 6711, + replicasLen: 0, + }, + } + + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + instances, err := ReadReplicaInstances("localhost", tt.tabletPort) + require.NoError(t, err) + require.EqualValues(t, tt.replicasLen, len(instances)) + }) + } +} + +// TestReadProblemInstances is used to test the functionality of ReadProblemInstances and verify its failure modes and successes. +func TestReadProblemInstances(t *testing.T) { + // The test is intended to be used as follows. The initial data is stored into the database. Following this, some specific queries are run that each individual test specifies to get the desired state. 
+ tests := []struct { + name string + sql []string + instancesRequired []string + }{ + { + name: "No problems", + sql: nil, + instancesRequired: nil, + }, { + name: "Replication stopped on a replica", + sql: []string{ + "update database_instance set replication_sql_thread_state = 0 where alias = 'zone1-0000000112'", + }, + instancesRequired: []string{"zone1-0000000112"}, + }, { + name: "IO thread stopped on a replica", + sql: []string{ + "update database_instance set replication_io_thread_state = 0 where alias = 'zone1-0000000112'", + }, + instancesRequired: []string{"zone1-0000000112"}, + }, { + name: "High replication lag", + sql: []string{ + "update database_instance set replication_lag_seconds = 1000 where alias = 'zone1-0000000112'", + }, + instancesRequired: []string{"zone1-0000000112"}, + }, { + name: "High replication lag - replica_lag", + sql: []string{ + "update database_instance set replica_lag_seconds = 1000 where alias = 'zone1-0000000112'", + }, + instancesRequired: []string{"zone1-0000000112"}, + }, { + name: "errant GTID", + sql: []string{ + "update database_instance set gtid_errant = '729a4cc4-8680-11ed-a104-47706090afbd:1' where alias = 'zone1-0000000112'", + }, + instancesRequired: []string{"zone1-0000000112"}, + }, { + name: "Many failures", + sql: []string{ + "update database_instance set gtid_errant = '729a4cc4-8680-11ed-a104-47706090afbd:1' where alias = 'zone1-0000000112'", + "update database_instance set replication_sql_thread_state = 0 where alias = 'zone1-0000000100'", + }, + instancesRequired: []string{"zone1-0000000112", "zone1-0000000100"}, + }, + } + + // We need to set InstancePollSeconds to a large value otherwise all the instances are reported as having problems since their last_checked is very old. + // Setting this value to a hundred years, we ensure that this test doesn't fail with this issue for the next hundred years. 
+ oldVal := config.Config.InstancePollSeconds + defer func() { + config.Config.InstancePollSeconds = oldVal + }() + config.Config.InstancePollSeconds = 60 * 60 * 24 * 365 * 100 + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Each test should clear the database. The easiest way to do that is to run all the initialization commands again + defer func() { + db.ClearVTOrcDatabase() + }() + + for _, query := range append(initialSQL, tt.sql...) { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + instances, err := ReadProblemInstances("ks", "0") + require.NoError(t, err) + var tabletAliases []string + for _, instance := range instances { + tabletAliases = append(tabletAliases, instance.InstanceAlias) + } + require.ElementsMatch(t, tabletAliases, tt.instancesRequired) + }) + } +} + +// TestReadInstancesByCondition is used to test the functionality of readInstancesByCondition and verify its failure modes and successes. +func TestReadInstancesByCondition(t *testing.T) { + tests := []struct { + name string + condition string + args []any + sort string + instancesRequired []string + }{ + { + name: "All instances with no sort", + condition: "1=1", + instancesRequired: []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000112", "zone2-0000000200"}, + }, { + name: "All instances sort by data_center descending and then alias ascending", + condition: "1=1", + sort: "data_center desc, alias asc", + instancesRequired: []string{"zone2-0000000200", "zone1-0000000100", "zone1-0000000101", "zone1-0000000112"}, + }, { + name: "Filtering by replication_depth", + condition: "replication_depth=1", + instancesRequired: []string{"zone1-0000000100", "zone1-0000000112", "zone2-0000000200"}, + }, { + name: "Filtering by exact alias", + condition: "alias='zone1-0000000100'", + instancesRequired: []string{"zone1-0000000100"}, + }, { + name: "No qualifying tablets", + condition: "replication_depth=15", + }, + } + + // Clear the database after the 
test. The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + instances, err := readInstancesByCondition(tt.condition, tt.args, tt.sort) + require.NoError(t, err) + var tabletAliases []string + for _, instance := range instances { + tabletAliases = append(tabletAliases, instance.InstanceAlias) + } + require.EqualValues(t, tt.instancesRequired, tabletAliases) + }) + } +} + +// TestReadOutdatedInstanceKeys is used to test the functionality of ReadOutdatedInstanceKeys and verify its failure modes and successes. +func TestReadOutdatedInstanceKeys(t *testing.T) { + // The test is intended to be used as follows. The initial data is stored into the database. Following this, some specific queries are run that each individual test specifies to get the desired state. 
+ tests := []struct { + name string + sql []string + instancesRequired []string + }{ + { + name: "No problems", + sql: []string{"update database_instance set last_checked = now()"}, + instancesRequired: nil, + }, { + name: "One instance is outdated", + sql: []string{ + "update database_instance set last_checked = now()", + "update database_instance set last_checked = time(now(), '-1 hour') where alias = 'zone1-0000000100'", + }, + instancesRequired: []string{"zone1-0000000100"}, + }, { + name: "One instance doesn't have mysql data", + sql: []string{ + "update database_instance set last_checked = now()", + `INSERT INTO vitess_tablet VALUES('zone1-0000000103','localhost',7706,'ks','0','zone1',2,'0001-01-01 00:00:00+00:00','');`, + }, + instancesRequired: []string{"zone1-0000000103"}, + }, { + name: "One instance doesn't have mysql data and one is outdated", + sql: []string{ + "update database_instance set last_checked = now()", + "update database_instance set last_checked = time(now(), '-1 hour') where alias = 'zone1-0000000100'", + `INSERT INTO vitess_tablet VALUES('zone1-0000000103','localhost',7706,'ks','0','zone1',2,'0001-01-01 00:00:00+00:00','');`, + }, + instancesRequired: []string{"zone1-0000000103", "zone1-0000000100"}, + }, + } + + // We are setting InstancePollSeconds to 59 minutes, just for the test. + oldVal := config.Config.InstancePollSeconds + defer func() { + config.Config.InstancePollSeconds = oldVal + }() + config.Config.InstancePollSeconds = 60 * 59 + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Each test should clear the database. The easiest way to do that is to run all the initialization commands again + defer func() { + db.ClearVTOrcDatabase() + }() + + for _, query := range append(initialSQL, tt.sql...) 
{ + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + tabletAliases, err := ReadOutdatedInstanceKeys() + require.NoError(t, err) + require.ElementsMatch(t, tabletAliases, tt.instancesRequired) + }) + } +} + +// TestUpdateInstanceLastChecked is used to test the functionality of UpdateInstanceLastChecked and verify its failure modes and successes. +func TestUpdateInstanceLastChecked(t *testing.T) { + tests := []struct { + name string + tabletAlias string + partialSuccess bool + conditionToCheck string + }{ + { + name: "Verify updated last checked", + tabletAlias: "zone1-0000000100", + partialSuccess: false, + conditionToCheck: "last_checked >= now() - interval 30 second and last_check_partial_success = false", + }, { + name: "Verify partial success", + tabletAlias: "zone1-0000000100", + partialSuccess: true, + conditionToCheck: "last_checked >= now() - interval 30 second and last_check_partial_success = true", + }, { + name: "Verify no error on unknown tablet", + tabletAlias: "unknown tablet", + partialSuccess: true, + }, + } + + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := UpdateInstanceLastChecked(tt.tabletAlias, tt.partialSuccess) + require.NoError(t, err) + + if tt.conditionToCheck != "" { + // Verify the instance we just updated satisfies the condition specified. 
+ instances, err := readInstancesByCondition(tt.conditionToCheck, nil, "") + require.NoError(t, err) + var tabletAliases []string + for _, instance := range instances { + tabletAliases = append(tabletAliases, instance.InstanceAlias) + } + require.Contains(t, tabletAliases, tt.tabletAlias) + } + }) + } +} + +// UpdateInstanceLastAttemptedCheck is used to test the functionality of UpdateInstanceLastAttemptedCheck and verify its failure modes and successes. +func TestUpdateInstanceLastAttemptedCheck(t *testing.T) { + tests := []struct { + name string + tabletAlias string + conditionToCheck string + }{ + { + name: "Verify updated last checked", + tabletAlias: "zone1-0000000100", + conditionToCheck: "last_attempted_check >= now() - interval 30 second", + }, { + name: "Verify no error on unknown tablet", + tabletAlias: "unknown tablet", + }, + } + + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := UpdateInstanceLastAttemptedCheck(tt.tabletAlias) + require.NoError(t, err) + + if tt.conditionToCheck != "" { + // Verify the instance we just updated satisfies the condition specified. + instances, err := readInstancesByCondition(tt.conditionToCheck, nil, "") + require.NoError(t, err) + var tabletAliases []string + for _, instance := range instances { + tabletAliases = append(tabletAliases, instance.InstanceAlias) + } + require.Contains(t, tabletAliases, tt.tabletAlias) + } + }) + } +} + +// TestForgetInstanceAndInstanceIsForgotten tests the functionality of ForgetInstance and InstanceIsForgotten together. 
+func TestForgetInstanceAndInstanceIsForgotten(t *testing.T) { + tests := []struct { + name string + tabletAlias string + errExpected string + instanceForgotten bool + tabletsExpected []string + }{ + { + name: "Unknown tablet", + tabletAlias: "unknown-tablet", + errExpected: "ForgetInstance(): instance unknown-tablet not found", + instanceForgotten: true, + tabletsExpected: []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000112", "zone2-0000000200"}, + }, { + name: "Empty tabletAlias", + tabletAlias: "", + errExpected: "ForgetInstance(): empty tabletAlias", + instanceForgotten: false, + tabletsExpected: []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000112", "zone2-0000000200"}, + }, { + name: "Success", + tabletAlias: "zone1-0000000112", + instanceForgotten: true, + tabletsExpected: []string{"zone1-0000000100", "zone1-0000000101", "zone2-0000000200"}, + }, + } + + oldCache := forgetAliases + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + forgetAliases = oldCache + db.ClearVTOrcDatabase() + }() + forgetAliases = cache.New(time.Minute, time.Minute) + + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := ForgetInstance(tt.tabletAlias) + if tt.errExpected != "" { + require.EqualError(t, err, tt.errExpected) + } else { + require.NoError(t, err) + } + isForgotten := InstanceIsForgotten(tt.tabletAlias) + require.Equal(t, tt.instanceForgotten, isForgotten) + + instances, err := readInstancesByCondition("1=1", nil, "") + require.NoError(t, err) + var tabletAliases []string + for _, instance := range instances { + tabletAliases = append(tabletAliases, instance.InstanceAlias) + } + require.EqualValues(t, tt.tabletsExpected, tabletAliases) + }) + } +} + +func TestSnapshotTopologies(t *testing.T) { + // Clear the database after the test. 
The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + err := SnapshotTopologies() + require.NoError(t, err) + + query := "select alias from database_instance_topology_history" + var tabletAliases []string + err = db.QueryVTOrc(query, nil, func(rowMap sqlutils.RowMap) error { + tabletAliases = append(tabletAliases, rowMap.GetString("alias")) + return nil + }) + require.NoError(t, err) + + require.Equal(t, []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000112", "zone2-0000000200"}, tabletAliases) +} diff --git a/go/vt/vtorc/inst/instance_key.go b/go/vt/vtorc/inst/instance_key.go deleted file mode 100644 index 2a3124aeb57..00000000000 --- a/go/vt/vtorc/inst/instance_key.go +++ /dev/null @@ -1,189 +0,0 @@ -/* - Copyright 2015 Shlomi Noach, courtesy Booking.com - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package inst - -import ( - "fmt" - "regexp" - "strconv" - "strings" -) - -// InstanceKey is an instance indicator, identifued by hostname and port -type InstanceKey struct { - Hostname string - Port int -} - -var ( - ipv4Regexp = regexp.MustCompile(`^([0-9]+)[.]([0-9]+)[.]([0-9]+)[.]([0-9]+)$`) - ipv4HostPortRegexp = regexp.MustCompile(`^([^:]+):([0-9]+)$`) - ipv4HostRegexp = regexp.MustCompile(`^([^:]+)$`) - ipv6HostPortRegexp = regexp.MustCompile(`^\[([:0-9a-fA-F]+)\]:([0-9]+)$`) // e.g. 
[2001:db8:1f70::999:de8:7648:6e8]:3308 - ipv6HostRegexp = regexp.MustCompile(`^([:0-9a-fA-F]+)$`) // e.g. 2001:db8:1f70::999:de8:7648:6e8 -) - -const detachHint = "//" - -func newInstanceKey(hostname string, port int, resolve bool) (instanceKey *InstanceKey, err error) { - if hostname == "" { - return instanceKey, fmt.Errorf("NewResolveInstanceKey: Empty hostname") - } - - instanceKey = &InstanceKey{Hostname: hostname, Port: port} - if resolve { - instanceKey, err = instanceKey.ResolveHostname() - } - return instanceKey, err -} - -// newInstanceKeyStrings -func newInstanceKeyStrings(hostname string, port string, resolve bool) (*InstanceKey, error) { - portInt, err := strconv.Atoi(port) - if err != nil { - return nil, fmt.Errorf("Invalid port: %s", port) - } - return newInstanceKey(hostname, portInt, resolve) -} - -func parseRawInstanceKey(hostPort string, resolve bool) (instanceKey *InstanceKey, err error) { - hostname := "" - port := "" - if submatch := ipv4HostPortRegexp.FindStringSubmatch(hostPort); len(submatch) > 0 { - hostname = submatch[1] - port = submatch[2] - } else if submatch := ipv4HostRegexp.FindStringSubmatch(hostPort); len(submatch) > 0 { - hostname = submatch[1] - } else if submatch := ipv6HostPortRegexp.FindStringSubmatch(hostPort); len(submatch) > 0 { - hostname = submatch[1] - port = submatch[2] - } else if submatch := ipv6HostRegexp.FindStringSubmatch(hostPort); len(submatch) > 0 { - hostname = submatch[1] - } else { - return nil, fmt.Errorf("Cannot parse address: %s", hostPort) - } - if port == "" { - port = "3306" - } - return newInstanceKeyStrings(hostname, port, resolve) -} - -func NewResolveInstanceKey(hostname string, port int) (instanceKey *InstanceKey, err error) { - return newInstanceKey(hostname, port, true) -} - -// NewResolveInstanceKeyStrings creates and resolves a new instance key based on string params -func NewResolveInstanceKeyStrings(hostname string, port string) (*InstanceKey, error) { - return newInstanceKeyStrings(hostname, 
port, true) -} - -func ParseResolveInstanceKey(hostPort string) (instanceKey *InstanceKey, err error) { - return parseRawInstanceKey(hostPort, true) -} - -func ParseRawInstanceKey(hostPort string) (instanceKey *InstanceKey, err error) { - return parseRawInstanceKey(hostPort, false) -} - -// NewResolveInstanceKeyStrings creates and resolves a new instance key based on string params -func NewRawInstanceKeyStrings(hostname string, port string) (*InstanceKey, error) { - return newInstanceKeyStrings(hostname, port, false) -} - -func (instanceKey *InstanceKey) ResolveHostname() (*InstanceKey, error) { - if !instanceKey.IsValid() { - return instanceKey, nil - } - - hostname, err := ResolveHostname(instanceKey.Hostname) - if err == nil { - instanceKey.Hostname = hostname - } - return instanceKey, err -} - -// Equals tests equality between this key and another key -func (instanceKey *InstanceKey) Equals(other *InstanceKey) bool { - if other == nil { - return false - } - return instanceKey.Hostname == other.Hostname && instanceKey.Port == other.Port -} - -// SmallerThan returns true if this key is dictionary-smaller than another. -// This is used for consistent sorting/ordering; there's nothing magical about it. 
-func (instanceKey *InstanceKey) SmallerThan(other *InstanceKey) bool { - if instanceKey.Hostname < other.Hostname { - return true - } - if instanceKey.Hostname == other.Hostname && instanceKey.Port < other.Port { - return true - } - return false -} - -// IsDetached returns 'true' when this hostname is logically "detached" -func (instanceKey *InstanceKey) IsDetached() bool { - return strings.HasPrefix(instanceKey.Hostname, detachHint) -} - -// IsValid uses simple heuristics to see whether this key represents an actual instance -func (instanceKey *InstanceKey) IsValid() bool { - if instanceKey.Hostname == "_" { - return false - } - if instanceKey.IsDetached() { - return false - } - return len(instanceKey.Hostname) > 0 && instanceKey.Port > 0 -} - -// DetachedKey returns an instance key whose hostname is detahced: invalid, but recoverable -func (instanceKey *InstanceKey) DetachedKey() *InstanceKey { - if instanceKey.IsDetached() { - return instanceKey - } - return &InstanceKey{Hostname: fmt.Sprintf("%s%s", detachHint, instanceKey.Hostname), Port: instanceKey.Port} -} - -// ReattachedKey returns an instance key whose hostname is detahced: invalid, but recoverable -func (instanceKey *InstanceKey) ReattachedKey() *InstanceKey { - if !instanceKey.IsDetached() { - return instanceKey - } - return &InstanceKey{Hostname: instanceKey.Hostname[len(detachHint):], Port: instanceKey.Port} -} - -// StringCode returns an official string representation of this key -func (instanceKey *InstanceKey) StringCode() string { - return fmt.Sprintf("%s:%d", instanceKey.Hostname, instanceKey.Port) -} - -// DisplayString returns a user-friendly string representation of this key -func (instanceKey *InstanceKey) DisplayString() string { - return instanceKey.StringCode() -} - -// String returns a user-friendly string representation of this key -func (instanceKey InstanceKey) String() string { - return instanceKey.StringCode() -} - -// IsValid uses simple heuristics to see whether this key 
represents an actual instance -func (instanceKey *InstanceKey) IsIPv4() bool { - return ipv4Regexp.MatchString(instanceKey.Hostname) -} diff --git a/go/vt/vtorc/inst/instance_key_map.go b/go/vt/vtorc/inst/instance_key_map.go deleted file mode 100644 index 15d21151f12..00000000000 --- a/go/vt/vtorc/inst/instance_key_map.go +++ /dev/null @@ -1,141 +0,0 @@ -/* - Copyright 2015 Shlomi Noach, courtesy Booking.com - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package inst - -import ( - "encoding/json" - "sort" - "strings" -) - -// InstanceKeyMap is a convenience struct for listing InstanceKey-s -type InstanceKeyMap map[InstanceKey]bool - -func NewInstanceKeyMap() *InstanceKeyMap { - return &InstanceKeyMap{} -} - -// AddKey adds a single key to this map -func (instanceKeyMap *InstanceKeyMap) AddKey(key InstanceKey) { - (*instanceKeyMap)[key] = true -} - -// AddKeys adds all given keys to this map -func (instanceKeyMap *InstanceKeyMap) AddKeys(keys []InstanceKey) { - for _, key := range keys { - instanceKeyMap.AddKey(key) - } -} - -// AddInstances adds keys of all given instances to this map -func (instanceKeyMap *InstanceKeyMap) AddInstances(instances [](*Instance)) { - for _, instance := range instances { - instanceKeyMap.AddKey(instance.Key) - } -} - -// HasKey checks if given key is within the map -func (instanceKeyMap *InstanceKeyMap) HasKey(key InstanceKey) bool { - _, ok := (*instanceKeyMap)[key] - return ok -} - -// GetInstanceKeys returns keys in this map in the form of 
an array -func (instanceKeyMap *InstanceKeyMap) GetInstanceKeys() []InstanceKey { - res := []InstanceKey{} - for key := range *instanceKeyMap { - res = append(res, key) - } - sort.Slice(res, func(i, j int) bool { - return res[i].Hostname < res[j].Hostname || res[i].Hostname == res[j].Hostname && res[i].Port < res[j].Port - }) - return res -} - -// Intersect returns a keymap which is the intersection of this and another map -func (instanceKeyMap *InstanceKeyMap) Intersect(other *InstanceKeyMap) *InstanceKeyMap { - intersected := NewInstanceKeyMap() - for key := range *other { - if instanceKeyMap.HasKey(key) { - intersected.AddKey(key) - } - } - return intersected -} - -// MarshalJSON will marshal this map as JSON -func (instanceKeyMap InstanceKeyMap) MarshalJSON() ([]byte, error) { - return json.Marshal(instanceKeyMap.GetInstanceKeys()) -} - -// UnmarshalJSON reds this object from JSON -func (instanceKeyMap *InstanceKeyMap) UnmarshalJSON(b []byte) error { - var keys []InstanceKey - if err := json.Unmarshal(b, &keys); err != nil { - return err - } - *instanceKeyMap = make(InstanceKeyMap) - for _, key := range keys { - instanceKeyMap.AddKey(key) - } - return nil -} - -// ToJSON will marshal this map as JSON -func (instanceKeyMap *InstanceKeyMap) ToJSON() (string, error) { - bytes, err := instanceKeyMap.MarshalJSON() - return string(bytes), err -} - -// ToJSONString will marshal this map as JSON -func (instanceKeyMap *InstanceKeyMap) ToJSONString() string { - s, _ := instanceKeyMap.ToJSON() - return s -} - -// ToCommaDelimitedList will export this map in comma delimited format -func (instanceKeyMap *InstanceKeyMap) ToCommaDelimitedList() string { - keyDisplays := []string{} - for key := range *instanceKeyMap { - keyDisplays = append(keyDisplays, key.DisplayString()) - } - return strings.Join(keyDisplays, ",") -} - -// ReadJSON unmarshalls a json into this map -func (instanceKeyMap *InstanceKeyMap) ReadJSON(jsonString string) error { - var keys []InstanceKey - err := 
json.Unmarshal([]byte(jsonString), &keys) - if err != nil { - return err - } - instanceKeyMap.AddKeys(keys) - return err -} - -// ReadJSON unmarshalls a json into this map -func (instanceKeyMap *InstanceKeyMap) ReadCommaDelimitedList(list string) error { - tokens := strings.Split(list, ",") - for _, token := range tokens { - key, err := ParseResolveInstanceKey(token) - if err != nil { - return err - } - instanceKeyMap.AddKey(*key) - } - return nil -} diff --git a/go/vt/vtorc/inst/instance_key_map_test.go b/go/vt/vtorc/inst/instance_key_map_test.go deleted file mode 100644 index a390ef99532..00000000000 --- a/go/vt/vtorc/inst/instance_key_map_test.go +++ /dev/null @@ -1,125 +0,0 @@ -/* - Copyright 2014 Outbrain Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package inst - -import ( - "math/rand" - "testing" - - "github.com/stretchr/testify/require" - - "vitess.io/vitess/go/vt/vtorc/config" -) - -func init() { - config.MarkConfigurationLoaded() -} - -func TestGetInstanceKeys(t *testing.T) { - for range rand.Perm(10) { // Just running many iterations to cover multiple possible map iteration ordering. Perm() is just used as an array generator here. - m := *NewInstanceKeyMap() - m.AddKey(key1) - m.AddKey(key2) - keys := m.GetInstanceKeys() - require.Equal(t, keys[0], key1) - require.Equal(t, keys[1], key2) - } - for range rand.Perm(10) { // Just running many iterations to cover multiple possible map iteration ordering. Perm() is just used as an array generator here. 
- m := *NewInstanceKeyMap() - m.AddKey(key2) - m.AddKey(key1) - keys := m.GetInstanceKeys() - require.Equal(t, keys[0], key1) - require.Equal(t, keys[1], key2) - } -} - -func TestInstanceKeyMapToJSON(t *testing.T) { - m := *NewInstanceKeyMap() - m.AddKey(key1) - m.AddKey(key2) - json, err := m.ToJSON() - require.NoError(t, err) - ok := (json == `[{"Hostname":"host1","Port":3306},{"Hostname":"host2","Port":3306}]`) || (json == `[{"Hostname":"host2","Port":3306},{"Hostname":"host1","Port":3306}]`) - require.True(t, ok) -} - -func TestInstanceKeyMapReadJSON(t *testing.T) { - json := `[{"Hostname":"host1","Port":3306},{"Hostname":"host2","Port":3306}]` - m := *NewInstanceKeyMap() - _ = m.ReadJSON(json) - require.Equal(t, len(m), 2) - require.True(t, m[key1]) - require.True(t, m[key2]) -} - -func TestEmptyInstanceKeyMapToCommaDelimitedList(t *testing.T) { - m := *NewInstanceKeyMap() - res := m.ToCommaDelimitedList() - - require.Equal(t, res, "") -} - -func TestInstanceKeyMapToCommaDelimitedList(t *testing.T) { - m := *NewInstanceKeyMap() - m.AddKey(key1) - m.AddKey(key2) - res := m.ToCommaDelimitedList() - - ok := (res == `host1:3306,host2:3306`) || (res == `host2:3306,host1:3306`) - require.True(t, ok) -} - -func TestIntersect(t *testing.T) { - { - m := NewInstanceKeyMap() - m.AddKey(key1) - m.AddKey(key2) - - other := NewInstanceKeyMap() - other.AddKey(key3) - other.AddKey(key2) - - intersected := m.Intersect(other) - require.Equal(t, len(*intersected), 1) - } - { - m := NewInstanceKeyMap() - m.AddKey(key1) - - other := NewInstanceKeyMap() - other.AddKey(key3) - other.AddKey(key2) - - intersected := m.Intersect(other) - require.Equal(t, len(*intersected), 0) - } - { - m := NewInstanceKeyMap() - m.AddKey(key1) - m.AddKey(key2) - - other := NewInstanceKeyMap() - other.AddKey(key1) - other.AddKey(key3) - other.AddKey(key2) - - intersected := m.Intersect(other) - require.Equal(t, len(*intersected), 2) - } - -} diff --git a/go/vt/vtorc/inst/instance_key_test.go 
b/go/vt/vtorc/inst/instance_key_test.go deleted file mode 100644 index 1374aad570e..00000000000 --- a/go/vt/vtorc/inst/instance_key_test.go +++ /dev/null @@ -1,209 +0,0 @@ -/* - Copyright 2014 Outbrain Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package inst - -import ( - "testing" - - "github.com/stretchr/testify/require" - - "vitess.io/vitess/go/vt/vtorc/config" -) - -func init() { - config.MarkConfigurationLoaded() -} - -var key1 = InstanceKey{Hostname: "host1", Port: 3306} -var key2 = InstanceKey{Hostname: "host2", Port: 3306} -var key3 = InstanceKey{Hostname: "host3", Port: 3306} - -func TestInstanceKeyEquals(t *testing.T) { - i1 := Instance{ - Key: InstanceKey{ - Hostname: "sql00.db", - Port: 3306, - }, - Version: "5.6", - } - i2 := Instance{ - Key: InstanceKey{ - Hostname: "sql00.db", - Port: 3306, - }, - Version: "5.5", - } - - require.Equal(t, i1.Key, i2.Key) - - i2.Key.Port = 3307 - require.NotEqual(t, i1.Key, i2.Key) -} - -func TestNewResolveInstanceKey(t *testing.T) { - { - i, err := NewResolveInstanceKey("127.0.0.1", 3308) - require.NoError(t, err) - require.Equal(t, i.Hostname, "127.0.0.1") - require.Equal(t, i.Port, 3308) - } - { - _, err := NewResolveInstanceKey("", 3309) - require.Error(t, err) - } - { - i, err := NewResolveInstanceKey("127.0.0.1", 0) - require.NoError(t, err) - require.False(t, i.IsValid()) - } -} - -func TestParseResolveInstanceKey(t *testing.T) { - { - key, err := ParseResolveInstanceKey("myhost:1234") - require.NoError(t, err) 
- require.Equal(t, key.Hostname, "myhost") - require.Equal(t, key.Port, 1234) - } - { - key, err := ParseResolveInstanceKey("myhost") - require.NoError(t, err) - require.Equal(t, key.Hostname, "myhost") - require.Equal(t, key.Port, 3306) - } - { - key, err := ParseResolveInstanceKey("10.0.0.3:3307") - require.NoError(t, err) - require.Equal(t, key.Hostname, "10.0.0.3") - require.Equal(t, key.Port, 3307) - } - { - key, err := ParseResolveInstanceKey("10.0.0.3") - require.NoError(t, err) - require.Equal(t, key.Hostname, "10.0.0.3") - require.Equal(t, key.Port, 3306) - } - { - key, err := ParseResolveInstanceKey("[2001:db8:1f70::999:de8:7648:6e8]:3308") - require.NoError(t, err) - require.Equal(t, key.Hostname, "2001:db8:1f70::999:de8:7648:6e8") - require.Equal(t, key.Port, 3308) - } - { - key, err := ParseResolveInstanceKey("::1") - require.NoError(t, err) - require.Equal(t, key.Hostname, "::1") - require.Equal(t, key.Port, 3306) - } - { - key, err := ParseResolveInstanceKey("0:0:0:0:0:0:0:0") - require.NoError(t, err) - require.Equal(t, key.Hostname, "0:0:0:0:0:0:0:0") - require.Equal(t, key.Port, 3306) - } - { - _, err := ParseResolveInstanceKey("[2001:xxxx:1f70::999:de8:7648:6e8]:3308") - require.Error(t, err) - } - { - _, err := ParseResolveInstanceKey("10.0.0.4:") - require.Error(t, err) - } - { - _, err := ParseResolveInstanceKey("10.0.0.4:5.6.7") - require.Error(t, err) - } -} - -func TestNewResolveInstanceKeyStrings(t *testing.T) { - { - i, err := NewResolveInstanceKeyStrings("127.0.0.1", "3306") - require.NoError(t, err) - require.Equal(t, i.Hostname, "127.0.0.1") - require.Equal(t, i.Port, 3306) - } - { - _, err := NewResolveInstanceKeyStrings("127.0.0.1", "") - require.Error(t, err) - } - { - _, err := NewResolveInstanceKeyStrings("127.0.0.1", "3306x") - require.Error(t, err) - } -} - -func TestInstanceKeyValid(t *testing.T) { - require.True(t, key1.IsValid()) - i, err := ParseResolveInstanceKey("_:3306") - require.NoError(t, err) - require.False(t, 
i.IsValid()) - i, err = ParseResolveInstanceKey("//myhost:3306") - require.NoError(t, err) - require.False(t, i.IsValid()) -} - -func TestInstanceKeyDetach(t *testing.T) { - require.False(t, key1.IsDetached()) - detached1 := key1.DetachedKey() - require.True(t, detached1.IsDetached()) - detached2 := key1.DetachedKey() - require.True(t, detached2.IsDetached()) - require.True(t, detached1.Equals(detached2)) - - reattached1 := detached1.ReattachedKey() - require.False(t, reattached1.IsDetached()) - require.True(t, reattached1.Equals(&key1)) - reattached2 := reattached1.ReattachedKey() - require.False(t, reattached2.IsDetached()) - require.True(t, reattached1.Equals(reattached2)) -} - -func TestIsIPv4(t *testing.T) { - require.False(t, key1.IsIPv4()) - { - k, _ := ParseRawInstanceKey("mysql-server-1:3306") - require.False(t, k.IsIPv4()) - } - { - k, _ := ParseRawInstanceKey("mysql-server-1") - require.False(t, k.IsIPv4()) - } - { - k, _ := ParseRawInstanceKey("my.sql.server.1") - require.False(t, k.IsIPv4()) - } - { - k, _ := ParseRawInstanceKey("mysql-server-1:3306") - require.False(t, k.IsIPv4()) - } - { - k, _ := ParseRawInstanceKey("127.0.0:3306") - require.False(t, k.IsIPv4()) - } - { - k, _ := ParseRawInstanceKey("127::0::0::1:3306") - require.False(t, k.IsIPv4()) - } - { - k, _ := ParseRawInstanceKey("127.0.0.1:3306") - require.True(t, k.IsIPv4()) - } - { - k, _ := ParseRawInstanceKey("127.0.0.1") - require.True(t, k.IsIPv4()) - } -} diff --git a/go/vt/vtorc/inst/instance_test.go b/go/vt/vtorc/inst/instance_test.go index ebc2d9d0c89..8dba101a8dc 100644 --- a/go/vt/vtorc/inst/instance_test.go +++ b/go/vt/vtorc/inst/instance_test.go @@ -28,7 +28,7 @@ func init() { config.MarkConfigurationLoaded() } -var instance1 = Instance{Key: key1} +var instance1 = Instance{InstanceAlias: "zone1-100"} func TestIsSmallerMajorVersion(t *testing.T) { i55 := Instance{Version: "5.5"} @@ -55,9 +55,9 @@ func TestIsVersion(t *testing.T) { } func TestIsSmallerBinlogFormat(t *testing.T) 
{ - iStatement := &Instance{Key: key1, BinlogFormat: "STATEMENT"} - iRow := &Instance{Key: key2, BinlogFormat: "ROW"} - iMixed := &Instance{Key: key3, BinlogFormat: "MIXED"} + iStatement := &Instance{BinlogFormat: "STATEMENT"} + iRow := &Instance{BinlogFormat: "ROW"} + iMixed := &Instance{BinlogFormat: "MIXED"} require.True(t, iStatement.IsSmallerBinlogFormat(iRow)) require.False(t, iStatement.IsSmallerBinlogFormat(iStatement)) require.False(t, iRow.IsSmallerBinlogFormat(iStatement)) @@ -79,7 +79,7 @@ func TestReplicationThreads(t *testing.T) { require.True(t, instance1.ReplicationThreadsStopped()) } { - i := Instance{Key: key1, ReplicationIOThreadState: ReplicationThreadStateNoThread, ReplicationSQLThreadState: ReplicationThreadStateNoThread} + i := Instance{InstanceAlias: "zone1-100", ReplicationIOThreadState: ReplicationThreadStateNoThread, ReplicationSQLThreadState: ReplicationThreadStateNoThread} require.False(t, i.ReplicationThreadsExist()) } } diff --git a/go/vt/vtorc/inst/keyspace_dao.go b/go/vt/vtorc/inst/keyspace_dao.go index a06d9baa11e..d764e3fc56a 100644 --- a/go/vt/vtorc/inst/keyspace_dao.go +++ b/go/vt/vtorc/inst/keyspace_dao.go @@ -22,6 +22,7 @@ import ( "vitess.io/vitess/go/vt/external/golib/sqlutils" topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/vtctl/reparentutil" "vitess.io/vitess/go/vt/vtorc/db" ) @@ -77,3 +78,12 @@ func SaveKeyspace(keyspace *topo.KeyspaceInfo) error { ) return err } + +// GetDurabilityPolicy gets the durability policy for the given keyspace. 
+func GetDurabilityPolicy(keyspace string) (reparentutil.Durabler, error) { + ki, err := ReadKeyspace(keyspace) + if err != nil { + return nil, err + } + return reparentutil.GetDurabilityPolicy(ki.DurabilityPolicy) +} diff --git a/go/vt/vtorc/inst/keyspace_dao_test.go b/go/vt/vtorc/inst/keyspace_dao_test.go index 56ad06ec9e5..015d3e75256 100644 --- a/go/vt/vtorc/inst/keyspace_dao_test.go +++ b/go/vt/vtorc/inst/keyspace_dao_test.go @@ -25,23 +25,24 @@ import ( topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topotools" + "vitess.io/vitess/go/vt/vtctl/reparentutil" "vitess.io/vitess/go/vt/vtorc/db" ) func TestSaveAndReadKeyspace(t *testing.T) { - orcDb, err := db.OpenVTOrc() - require.NoError(t, err) + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. defer func() { - _, err = orcDb.Exec("delete from vitess_keyspace") - require.NoError(t, err) + db.ClearVTOrcDatabase() }() tests := []struct { - name string - keyspaceName string - keyspace *topodatapb.Keyspace - keyspaceWanted *topodatapb.Keyspace - err string + name string + keyspaceName string + keyspace *topodatapb.Keyspace + keyspaceWanted *topodatapb.Keyspace + err string + errInDurabilityPolicy string + semiSyncAckersWanted int }{ { name: "Success with keyspaceType and durability", @@ -50,16 +51,16 @@ func TestSaveAndReadKeyspace(t *testing.T) { KeyspaceType: topodatapb.KeyspaceType_NORMAL, DurabilityPolicy: "semi_sync", }, - keyspaceWanted: nil, - err: "", + keyspaceWanted: nil, + semiSyncAckersWanted: 1, }, { name: "Success with keyspaceType and no durability", keyspaceName: "ks2", keyspace: &topodatapb.Keyspace{ KeyspaceType: topodatapb.KeyspaceType_NORMAL, }, - keyspaceWanted: nil, - err: "", + keyspaceWanted: nil, + errInDurabilityPolicy: "durability policy not found", }, { name: "Success with snapshot keyspaceType", keyspaceName: "ks3", @@ -67,7 +68,6 @@ func TestSaveAndReadKeyspace(t 
*testing.T) { KeyspaceType: topodatapb.KeyspaceType_SNAPSHOT, }, keyspaceWanted: nil, - err: "", }, { name: "Success with fields that are not stored", keyspaceName: "ks4", @@ -80,7 +80,7 @@ func TestSaveAndReadKeyspace(t *testing.T) { KeyspaceType: topodatapb.KeyspaceType_NORMAL, DurabilityPolicy: "none", }, - err: "", + semiSyncAckersWanted: 0, }, { name: "No keyspace found", keyspaceName: "ks5", @@ -107,11 +107,21 @@ func TestSaveAndReadKeyspace(t *testing.T) { readKeyspaceInfo, err := ReadKeyspace(tt.keyspaceName) if tt.err != "" { require.EqualError(t, err, tt.err) - } else { - require.NoError(t, err) - require.True(t, topotools.KeyspaceEquality(tt.keyspaceWanted, readKeyspaceInfo.Keyspace)) - require.Equal(t, tt.keyspaceName, readKeyspaceInfo.KeyspaceName()) + return + } + require.NoError(t, err) + require.True(t, topotools.KeyspaceEquality(tt.keyspaceWanted, readKeyspaceInfo.Keyspace)) + require.Equal(t, tt.keyspaceName, readKeyspaceInfo.KeyspaceName()) + if tt.keyspace.KeyspaceType == topodatapb.KeyspaceType_SNAPSHOT { + return + } + durabilityPolicy, err := GetDurabilityPolicy(tt.keyspaceName) + if tt.errInDurabilityPolicy != "" { + require.EqualError(t, err, tt.errInDurabilityPolicy) + return } + require.NoError(t, err) + require.EqualValues(t, tt.semiSyncAckersWanted, reparentutil.SemiSyncAckers(durabilityPolicy, nil)) }) } } diff --git a/go/vt/vtorc/inst/maintenance.go b/go/vt/vtorc/inst/maintenance.go deleted file mode 100644 index 08fa3554d1e..00000000000 --- a/go/vt/vtorc/inst/maintenance.go +++ /dev/null @@ -1,45 +0,0 @@ -/* - Copyright 2014 Outbrain Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package inst - -import ( - "vitess.io/vitess/go/vt/vtorc/config" -) - -// Maintenance indicates a maintenance entry (also in the database) -type Maintenance struct { - MaintenanceID uint - Key InstanceKey - BeginTimestamp string - SecondsElapsed uint - IsActive bool - Owner string - Reason string -} - -var maintenanceOwner string - -func GetMaintenanceOwner() string { - if maintenanceOwner != "" { - return maintenanceOwner - } - return config.MaintenanceOwner -} - -func SetMaintenanceOwner(owner string) { - maintenanceOwner = owner -} diff --git a/go/vt/vtorc/inst/maintenance_dao.go b/go/vt/vtorc/inst/maintenance_dao.go deleted file mode 100644 index b2ac833b353..00000000000 --- a/go/vt/vtorc/inst/maintenance_dao.go +++ /dev/null @@ -1,86 +0,0 @@ -/* - Copyright 2014 Outbrain Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -package inst - -import ( - "fmt" - - "vitess.io/vitess/go/vt/log" - - "vitess.io/vitess/go/vt/vtorc/config" - "vitess.io/vitess/go/vt/vtorc/db" -) - -// ExpireMaintenance will remove the maintenance flag on old maintenances and on bounded maintenances -func ExpireMaintenance() error { - { - res, err := db.ExecVTOrc(` - delete from - database_instance_maintenance - where - maintenance_active is null - and end_timestamp < NOW() - INTERVAL ? DAY - `, - config.MaintenancePurgeDays, - ) - if err != nil { - log.Error(err) - return err - } - if rowsAffected, _ := res.RowsAffected(); rowsAffected > 0 { - _ = AuditOperation("expire-maintenance", nil, fmt.Sprintf("Purged historical entries: %d", rowsAffected)) - } - } - { - res, err := db.ExecVTOrc(` - delete from - database_instance_maintenance - where - maintenance_active = 1 - and end_timestamp < NOW() - `, - ) - if err != nil { - log.Error(err) - return err - } - if rowsAffected, _ := res.RowsAffected(); rowsAffected > 0 { - _ = AuditOperation("expire-maintenance", nil, fmt.Sprintf("Expired bounded: %d", rowsAffected)) - } - } - { - res, err := db.ExecVTOrc(` - delete from - database_instance_maintenance - where - explicitly_bounded = 0 - and concat(processing_node_hostname, ':', processing_node_token) not in ( - select concat(hostname, ':', token) from node_health - ) - `, - ) - if err != nil { - log.Error(err) - return err - } - if rowsAffected, _ := res.RowsAffected(); rowsAffected > 0 { - _ = AuditOperation("expire-maintenance", nil, fmt.Sprintf("Expired dead: %d", rowsAffected)) - } - } - - return nil -} diff --git a/go/vt/vtorc/inst/resolve.go b/go/vt/vtorc/inst/resolve.go deleted file mode 100644 index ac3d3f6dc88..00000000000 --- a/go/vt/vtorc/inst/resolve.go +++ /dev/null @@ -1,265 +0,0 @@ -/* - Copyright 2014 Outbrain Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package inst - -import ( - "errors" - "fmt" - "net" - "strings" - "sync" - "time" - - "github.com/patrickmn/go-cache" - - "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/vtorc/config" -) - -type HostnameResolve struct { - hostname string - resolvedHostname string -} - -func (hostnameResolve HostnameResolve) String() string { - return fmt.Sprintf("%s %s", hostnameResolve.hostname, hostnameResolve.resolvedHostname) -} - -type HostnameUnresolve struct { - hostname string - unresolvedHostname string -} - -func (hostnameUnresolve HostnameUnresolve) String() string { - return fmt.Sprintf("%s %s", hostnameUnresolve.hostname, hostnameUnresolve.unresolvedHostname) -} - -type HostnameRegistration struct { - CreatedAt time.Time - Key InstanceKey - Hostname string -} - -func NewHostnameRegistration(instanceKey *InstanceKey, hostname string) *HostnameRegistration { - return &HostnameRegistration{ - CreatedAt: time.Now(), - Key: *instanceKey, - Hostname: hostname, - } -} - -func NewHostnameDeregistration(instanceKey *InstanceKey) *HostnameRegistration { - return &HostnameRegistration{ - CreatedAt: time.Now(), - Key: *instanceKey, - Hostname: "", - } -} - -var hostnameResolvesLightweightCache *cache.Cache -var hostnameResolvesLightweightCacheInit = &sync.Mutex{} -var hostnameResolvesLightweightCacheLoadedOnceFromDB = false -var hostnameIPsCache = cache.New(10*time.Minute, time.Minute) - -func getHostnameResolvesLightweightCache() *cache.Cache { - hostnameResolvesLightweightCacheInit.Lock() - defer hostnameResolvesLightweightCacheInit.Unlock() - if 
hostnameResolvesLightweightCache == nil { - hostnameResolvesLightweightCache = cache.New(time.Duration(config.ExpiryHostnameResolvesMinutes)*time.Minute, time.Minute) - } - return hostnameResolvesLightweightCache -} - -func HostnameResolveMethodIsNone() bool { - return strings.ToLower(config.HostnameResolveMethod) == "none" -} - -// GetCNAME resolves an IP or hostname into a normalized valid CNAME -func GetCNAME(hostname string) (string, error) { - res, err := net.LookupCNAME(hostname) - if err != nil { - return hostname, err - } - res = strings.TrimRight(res, ".") - return res, nil -} - -func resolveHostname(hostname string) (string, error) { - switch strings.ToLower(config.HostnameResolveMethod) { - case "none": - return hostname, nil - case "default": - return hostname, nil - case "cname": - return GetCNAME(hostname) - case "ip": - return getHostnameIP(hostname) - } - return hostname, nil -} - -// Attempt to resolve a hostname. This may return a database cached hostname or otherwise -// it may resolve the hostname via CNAME -func ResolveHostname(hostname string) (string, error) { - hostname = strings.TrimSpace(hostname) - if hostname == "" { - return hostname, errors.New("Will not resolve empty hostname") - } - if strings.Contains(hostname, ",") { - return hostname, fmt.Errorf("Will not resolve multi-hostname: %+v", hostname) - } - if (&InstanceKey{Hostname: hostname}).IsDetached() { - // quietly abort. Nothing to do. The hostname is detached for a reason: it - // will not be resolved, for sure. - return hostname, nil - } - - // First go to lightweight cache - if resolvedHostname, found := getHostnameResolvesLightweightCache().Get(hostname); found { - return resolvedHostname.(string), nil - } - - if !hostnameResolvesLightweightCacheLoadedOnceFromDB { - // A continuous-discovery will first make sure to load all resolves from DB. - // However cli does not do so. - // Anyway, it seems like the cache was not loaded from DB. 
Before doing real resolves, - // let's try and get the resolved hostname from database. - if !HostnameResolveMethodIsNone() { - go func() { - if resolvedHostname, err := ReadResolvedHostname(hostname); err == nil && resolvedHostname != "" { - getHostnameResolvesLightweightCache().Set(hostname, resolvedHostname, 0) - } - }() - } - } - - // Unfound: resolve! - log.Infof("Hostname unresolved yet: %s", hostname) - resolvedHostname, err := resolveHostname(hostname) - if err != nil { - // Problem. What we'll do is cache the hostname for just one minute, so as to avoid flooding requests - // on one hand, yet make it refresh shortly on the other hand. Anyway do not write to database. - getHostnameResolvesLightweightCache().Set(hostname, resolvedHostname, time.Minute) - return hostname, err - } - // Good result! Cache it, also to DB - log.Infof("Cache hostname resolve %s as %s", hostname, resolvedHostname) - go UpdateResolvedHostname(hostname, resolvedHostname) - return resolvedHostname, nil -} - -// UpdateResolvedHostname will store the given resolved hostname in cache -// Returns false when the key already existed with same resolved value (similar -// to AFFECTED_ROWS() in mysql) -func UpdateResolvedHostname(hostname string, resolvedHostname string) bool { - if resolvedHostname == "" { - return false - } - if existingResolvedHostname, found := getHostnameResolvesLightweightCache().Get(hostname); found && (existingResolvedHostname == resolvedHostname) { - return false - } - getHostnameResolvesLightweightCache().Set(hostname, resolvedHostname, 0) - if !HostnameResolveMethodIsNone() { - _ = WriteResolvedHostname(hostname, resolvedHostname) - } - return true -} - -func LoadHostnameResolveCache() error { - if !HostnameResolveMethodIsNone() { - return loadHostnameResolveCacheFromDatabase() - } - return nil -} - -func loadHostnameResolveCacheFromDatabase() error { - allHostnamesResolves, err := ReadAllHostnameResolves() - if err != nil { - return err - } - for _, hostnameResolve 
:= range allHostnamesResolves { - getHostnameResolvesLightweightCache().Set(hostnameResolve.hostname, hostnameResolve.resolvedHostname, 0) - } - hostnameResolvesLightweightCacheLoadedOnceFromDB = true - return nil -} - -func FlushNontrivialResolveCacheToDatabase() error { - if HostnameResolveMethodIsNone() { - return nil - } - items, _ := HostnameResolveCache() - for hostname := range items { - resolvedHostname, found := getHostnameResolvesLightweightCache().Get(hostname) - if found && (resolvedHostname.(string) != hostname) { - _ = WriteResolvedHostname(hostname, resolvedHostname.(string)) - } - } - return nil -} - -func HostnameResolveCache() (map[string]cache.Item, error) { - return getHostnameResolvesLightweightCache().Items(), nil -} - -func extractIPs(ips []net.IP) (ipv4String string, ipv6String string) { - for _, ip := range ips { - if ip4 := ip.To4(); ip4 != nil { - ipv4String = ip.String() - } else { - ipv6String = ip.String() - } - } - return ipv4String, ipv6String -} - -func getHostnameIPs(hostname string) (ips []net.IP, fromCache bool, err error) { - if ips, found := hostnameIPsCache.Get(hostname); found { - return ips.([]net.IP), true, nil - } - ips, err = net.LookupIP(hostname) - if err != nil { - log.Error(err) - return ips, false, err - } - hostnameIPsCache.Set(hostname, ips, cache.DefaultExpiration) - return ips, false, nil -} - -func getHostnameIP(hostname string) (ipString string, err error) { - ips, _, err := getHostnameIPs(hostname) - if err != nil { - return ipString, err - } - ipv4String, ipv6String := extractIPs(ips) - if ipv4String != "" { - return ipv4String, nil - } - return ipv6String, nil -} - -func ResolveHostnameIPs(hostname string) error { - ips, fromCache, err := getHostnameIPs(hostname) - if err != nil { - return err - } - if fromCache { - return nil - } - ipv4String, ipv6String := extractIPs(ips) - return writeHostnameIPs(hostname, ipv4String, ipv6String) -} diff --git a/go/vt/vtorc/inst/resolve_dao.go 
b/go/vt/vtorc/inst/resolve_dao.go deleted file mode 100644 index d38146469d2..00000000000 --- a/go/vt/vtorc/inst/resolve_dao.go +++ /dev/null @@ -1,219 +0,0 @@ -/* - Copyright 2014 Outbrain Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package inst - -import ( - "github.com/rcrowley/go-metrics" - - "vitess.io/vitess/go/vt/external/golib/sqlutils" - - "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/vtorc/config" - "vitess.io/vitess/go/vt/vtorc/db" -) - -var writeResolvedHostnameCounter = metrics.NewCounter() -var writeUnresolvedHostnameCounter = metrics.NewCounter() -var readResolvedHostnameCounter = metrics.NewCounter() -var readUnresolvedHostnameCounter = metrics.NewCounter() -var readAllResolvedHostnamesCounter = metrics.NewCounter() - -func init() { - _ = metrics.Register("resolve.write_resolved", writeResolvedHostnameCounter) - _ = metrics.Register("resolve.write_unresolved", writeUnresolvedHostnameCounter) - _ = metrics.Register("resolve.read_resolved", readResolvedHostnameCounter) - _ = metrics.Register("resolve.read_unresolved", readUnresolvedHostnameCounter) - _ = metrics.Register("resolve.read_resolved_all", readAllResolvedHostnamesCounter) -} - -// WriteResolvedHostname stores a hostname and the resolved hostname to backend database -func WriteResolvedHostname(hostname string, resolvedHostname string) error { - writeFunc := func() error { - _, err := db.ExecVTOrc(` - insert into - hostname_resolve (hostname, resolved_hostname, resolved_timestamp) - 
values - (?, ?, NOW()) - on duplicate key update - resolved_hostname = VALUES(resolved_hostname), - resolved_timestamp = VALUES(resolved_timestamp) - `, - hostname, - resolvedHostname) - if err != nil { - log.Error(err) - return err - } - if hostname != resolvedHostname { - // history is only interesting when there's actually something to resolve... - _, _ = db.ExecVTOrc(` - insert into - hostname_resolve_history (hostname, resolved_hostname, resolved_timestamp) - values - (?, ?, NOW()) - on duplicate key update - hostname=values(hostname), - resolved_timestamp=values(resolved_timestamp) - `, - hostname, - resolvedHostname) - } - writeResolvedHostnameCounter.Inc(1) - return nil - } - return ExecDBWriteFunc(writeFunc) -} - -// ReadResolvedHostname returns the resolved hostname given a hostname, or empty if not exists -func ReadResolvedHostname(hostname string) (string, error) { - var resolvedHostname string - - query := ` - select - resolved_hostname - from - hostname_resolve - where - hostname = ? - ` - - err := db.QueryVTOrc(query, sqlutils.Args(hostname), func(m sqlutils.RowMap) error { - resolvedHostname = m.GetString("resolved_hostname") - return nil - }) - readResolvedHostnameCounter.Inc(1) - - if err != nil { - log.Error(err) - } - return resolvedHostname, err -} - -func ReadAllHostnameResolves() ([]HostnameResolve, error) { - res := []HostnameResolve{} - query := ` - select - hostname, - resolved_hostname - from - hostname_resolve - ` - err := db.QueryVTOrcRowsMap(query, func(m sqlutils.RowMap) error { - hostnameResolve := HostnameResolve{hostname: m.GetString("hostname"), resolvedHostname: m.GetString("resolved_hostname")} - - res = append(res, hostnameResolve) - return nil - }) - readAllResolvedHostnamesCounter.Inc(1) - - if err != nil { - log.Error(err) - } - return res, err -} - -// ExpireHostnameUnresolve expires hostname_unresolve entries that haven't been updated recently. 
-func ExpireHostnameUnresolve() error { - writeFunc := func() error { - _, err := db.ExecVTOrc(` - delete from hostname_unresolve - where last_registered < NOW() - INTERVAL ? MINUTE - `, config.ExpiryHostnameResolvesMinutes, - ) - if err != nil { - log.Error(err) - } - return err - } - return ExecDBWriteFunc(writeFunc) -} - -// ForgetExpiredHostnameResolves -func ForgetExpiredHostnameResolves() error { - _, err := db.ExecVTOrc(` - delete - from hostname_resolve - where - resolved_timestamp < NOW() - interval ? minute`, - 2*config.ExpiryHostnameResolvesMinutes, - ) - return err -} - -// DeleteInvalidHostnameResolves removes invalid resolves. At this time these are: -// - infinite loop resolves (A->B and B->A), remove earlier mapping -func DeleteInvalidHostnameResolves() error { - var invalidHostnames []string - - query := ` - select - early.hostname - from - hostname_resolve as latest - join hostname_resolve early on (latest.resolved_hostname = early.hostname and latest.hostname = early.resolved_hostname) - where - latest.hostname != latest.resolved_hostname - and latest.resolved_timestamp > early.resolved_timestamp - ` - - err := db.QueryVTOrcRowsMap(query, func(m sqlutils.RowMap) error { - invalidHostnames = append(invalidHostnames, m.GetString("hostname")) - return nil - }) - if err != nil { - return err - } - - for _, invalidHostname := range invalidHostnames { - _, err = db.ExecVTOrc(` - delete - from hostname_resolve - where - hostname = ?`, - invalidHostname, - ) - if err != nil { - log.Error(err) - } - } - return err -} - -// writeHostnameIPs stroes an ipv4 and ipv6 associated witha hostname, if available -func writeHostnameIPs(hostname string, ipv4String string, ipv6String string) error { - writeFunc := func() error { - _, err := db.ExecVTOrc(` - insert into - hostname_ips (hostname, ipv4, ipv6, last_updated) - values - (?, ?, ?, NOW()) - on duplicate key update - ipv4 = VALUES(ipv4), - ipv6 = VALUES(ipv6), - last_updated = VALUES(last_updated) - `, - 
hostname, - ipv4String, - ipv6String, - ) - if err != nil { - log.Error(err) - } - return err - } - return ExecDBWriteFunc(writeFunc) -} diff --git a/go/vt/vtorc/inst/tablet_dao.go b/go/vt/vtorc/inst/tablet_dao.go index 9ef8c1fde80..1a4c3b859c5 100644 --- a/go/vt/vtorc/inst/tablet_dao.go +++ b/go/vt/vtorc/inst/tablet_dao.go @@ -20,12 +20,9 @@ import ( "context" "errors" - "vitess.io/vitess/go/vt/external/golib/sqlutils" - "vitess.io/vitess/go/vt/log" - "google.golang.org/protobuf/encoding/prototext" - "google.golang.org/protobuf/proto" + "vitess.io/vitess/go/vt/external/golib/sqlutils" "vitess.io/vitess/go/vt/logutil" replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata" @@ -36,94 +33,12 @@ import ( "vitess.io/vitess/go/vt/vttablet/tmclient" ) -// TopoServ is the connection to the topo server. -var TopoServ *topo.Server - // ErrTabletAliasNil is a fixed error message. var ErrTabletAliasNil = errors.New("tablet alias is nil") -// SwitchPrimary makes the new tablet the primary and proactively performs -// the necessary propagation to the old primary. The propagation is best -// effort. If it fails, the tablet's shard sync will eventually converge. -// The proactive propagation allows a competing VTOrc from discovering -// the successful action of a previous one, which reduces churn. -func SwitchPrimary(newPrimaryKey, oldPrimaryKey InstanceKey) error { - durability, err := GetDurabilityPolicy(newPrimaryKey) - if err != nil { - return err - } - newPrimaryTablet, err := ChangeTabletType(newPrimaryKey, topodatapb.TabletType_PRIMARY, SemiSyncAckers(durability, newPrimaryKey) > 0) - if err != nil { - return err - } - // The following operations are best effort. 
- if newPrimaryTablet.Type != topodatapb.TabletType_PRIMARY { - log.Errorf("Unexpected: tablet type did not change to primary: %v", newPrimaryTablet.Type) - return nil - } - ctx, cancel := context.WithTimeout(context.Background(), topo.RemoteOperationTimeout) - defer cancel() - _, err = TopoServ.UpdateShardFields(ctx, newPrimaryTablet.Keyspace, newPrimaryTablet.Shard, func(si *topo.ShardInfo) error { - if proto.Equal(si.PrimaryAlias, newPrimaryTablet.Alias) && proto.Equal(si.PrimaryTermStartTime, newPrimaryTablet.PrimaryTermStartTime) { - return topo.NewError(topo.NoUpdateNeeded, "") - } - - // We just successfully reparented. We should check timestamps, but always overwrite. - lastTerm := si.GetPrimaryTermStartTime() - newTerm := logutil.ProtoToTime(newPrimaryTablet.PrimaryTermStartTime) - if !newTerm.After(lastTerm) { - log.Errorf("Possible clock skew. New primary start time is before previous one: %v vs %v", newTerm, lastTerm) - } - - aliasStr := topoproto.TabletAliasString(newPrimaryTablet.Alias) - log.Infof("Updating shard record: primary_alias=%v, primary_term_start_time=%v", aliasStr, newTerm) - si.PrimaryAlias = newPrimaryTablet.Alias - si.PrimaryTermStartTime = newPrimaryTablet.PrimaryTermStartTime - return nil - }) - // Don't proceed if shard record could not be updated. - if err != nil { - log.Error(err) - return nil - } - if _, err := ChangeTabletType(oldPrimaryKey, topodatapb.TabletType_REPLICA, IsReplicaSemiSync(durability, newPrimaryKey, oldPrimaryKey)); err != nil { - // This is best effort. - log.Error(err) - } - return nil -} - -// ChangeTabletType designates the tablet that owns an instance as the primary. 
-func ChangeTabletType(instanceKey InstanceKey, tabletType topodatapb.TabletType, semiSync bool) (*topodatapb.Tablet, error) { - if instanceKey.Hostname == "" { - return nil, errors.New("can't set tablet to primary: instance is unspecified") - } - tablet, err := ReadTablet(instanceKey) - if err != nil { - return nil, err - } - tmc := tmclient.NewTabletManagerClient() - tmcCtx, tmcCancel := context.WithTimeout(context.Background(), topo.RemoteOperationTimeout) - defer tmcCancel() - if err := tmc.ChangeType(tmcCtx, tablet, tabletType, semiSync); err != nil { - return nil, err - } - tsCtx, tsCancel := context.WithTimeout(context.Background(), topo.RemoteOperationTimeout) - defer tsCancel() - ti, err := TopoServ.GetTablet(tsCtx, tablet.Alias) - if err != nil { - log.Error(err) - return nil, err - } - if err := SaveTablet(ti.Tablet); err != nil { - log.Error(err) - } - return ti.Tablet, nil -} - // ResetReplicationParameters resets the replication parameters on the given tablet. -func ResetReplicationParameters(instanceKey InstanceKey) error { - tablet, err := ReadTablet(instanceKey) +func ResetReplicationParameters(tabletAlias string) error { + tablet, err := ReadTablet(tabletAlias) if err != nil { return err } @@ -137,8 +52,8 @@ func ResetReplicationParameters(instanceKey InstanceKey) error { } // FullStatus gets the full status of the MySQL running in vttablet. -func FullStatus(instanceKey InstanceKey) (*replicationdatapb.FullStatus, error) { - tablet, err := ReadTablet(instanceKey) +func FullStatus(tabletAlias string) (*replicationdatapb.FullStatus, error) { + tablet, err := ReadTablet(tabletAlias) if err != nil { return nil, err } @@ -149,15 +64,15 @@ func FullStatus(instanceKey InstanceKey) (*replicationdatapb.FullStatus, error) } // ReadTablet reads the vitess tablet record. 
-func ReadTablet(instanceKey InstanceKey) (*topodatapb.Tablet, error) { +func ReadTablet(tabletAlias string) (*topodatapb.Tablet, error) { query := ` select info from vitess_tablet - where hostname=? and port=? + where alias = ? ` - args := sqlutils.Args(instanceKey.Hostname, instanceKey.Port) + args := sqlutils.Args(tabletAlias) tablet := &topodatapb.Tablet{} opts := prototext.UnmarshalOptions{DiscardUnknown: true} err := db.QueryVTOrc(query, args, func(row sqlutils.RowMap) error { diff --git a/go/vt/vtorc/inst/tablet_dao_test.go b/go/vt/vtorc/inst/tablet_dao_test.go new file mode 100644 index 00000000000..a876d857ace --- /dev/null +++ b/go/vt/vtorc/inst/tablet_dao_test.go @@ -0,0 +1,93 @@ +package inst + +import ( + "testing" + + "github.com/stretchr/testify/require" + + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/proto/vttime" + "vitess.io/vitess/go/vt/topo/topoproto" + "vitess.io/vitess/go/vt/topotools" + "vitess.io/vitess/go/vt/vtorc/db" +) + +func TestSaveAndReadTablet(t *testing.T) { + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. 
+ defer func() { + db.ClearVTOrcDatabase() + }() + + tests := []struct { + name string + tabletAlias string + tablet *topodatapb.Tablet + tabletWanted *topodatapb.Tablet + err string + }{ + { + name: "Success with primary type", + tabletAlias: "zone1-0000000100", + tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 1030, + PrimaryTermStartTime: &vttime.Time{ + Seconds: 1000, + Nanoseconds: 387, + }, + }, + tabletWanted: nil, + }, { + name: "Success with replica type", + tabletAlias: "zone1-0000000100", + tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_REPLICA, + MysqlHostname: "localhost", + MysqlPort: 1030, + }, + tabletWanted: nil, + }, { + name: "No tablet found", + tabletAlias: "zone1-190734", + tablet: nil, + tabletWanted: nil, + err: ErrTabletAliasNil.Error(), + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.tabletWanted == nil { + tt.tabletWanted = tt.tablet + } + + if tt.tablet != nil { + err := SaveTablet(tt.tablet) + require.NoError(t, err) + } + + readTable, err := ReadTablet(tt.tabletAlias) + if tt.err != "" { + require.EqualError(t, err, tt.err) + return + } + require.NoError(t, err) + require.True(t, topotools.TabletEquality(tt.tabletWanted, readTable)) + require.Equal(t, tt.tabletAlias, topoproto.TabletAliasString(readTable.Alias)) + }) + } +} diff --git a/go/vt/vtorc/inst/tag.go b/go/vt/vtorc/inst/tag.go deleted file mode 100644 index 3b9705b7dff..00000000000 --- a/go/vt/vtorc/inst/tag.go +++ /dev/null @@ -1,121 +0,0 @@ -/* - Copyright 2015 Shlomi Noach, courtesy Booking.com - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the 
License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package inst - -import ( - "fmt" - "regexp" - "strings" -) - -type Tag struct { - TagName string - TagValue string - HasValue bool - Negate bool -} - -var ( - negateTagEqualsRegexp = regexp.MustCompile("^~([^=]+)=(.*)$") - TagEqualsRegexp = regexp.MustCompile("^([^=]+)=(.*)$") - negateTagExistsRegexp = regexp.MustCompile("^~([^=]+)$") - tagExistsRegexp = regexp.MustCompile("^([^=]+)$") -) - -func NewTag(tagName string, tagValue string) (*Tag, error) { - tagName = strings.TrimSpace(tagName) - if tagName == "" { - return nil, fmt.Errorf("NewTag: empty tag name") - } - return &Tag{TagName: tagName, TagValue: tagValue}, nil -} - -func ParseTag(tagString string) (*Tag, error) { - tagString = strings.Replace(tagString, "!", "~", -1) - tagString = strings.TrimSpace(tagString) - - if submatch := negateTagEqualsRegexp.FindStringSubmatch(tagString); len(submatch) > 0 { - return &Tag{ - TagName: submatch[1], - TagValue: submatch[2], - HasValue: true, - Negate: true, - }, nil - } else if submatch := TagEqualsRegexp.FindStringSubmatch(tagString); len(submatch) > 0 { - return &Tag{ - TagName: submatch[1], - TagValue: submatch[2], - HasValue: true, - }, nil - } else if submatch := negateTagExistsRegexp.FindStringSubmatch(tagString); len(submatch) > 0 { - return &Tag{ - TagName: submatch[1], - Negate: true, - }, nil - } else if submatch := tagExistsRegexp.FindStringSubmatch(tagString); len(submatch) > 0 { - return &Tag{ - TagName: submatch[1], - }, nil - } - return nil, fmt.Errorf("Unable to parse tag: %s", tagString) -} - -func (tag *Tag) String() string { - return 
fmt.Sprintf("%s=%s", tag.TagName, tag.TagValue) -} - -func (tag *Tag) Display() string { - if tag.TagValue == "" { - return tag.TagName - } - return fmt.Sprintf("%s=%s", tag.TagName, tag.TagValue) -} - -func ParseIntersectTags(tagsString string) (tags [](*Tag), err error) { - for _, tagString := range strings.Split(tagsString, ",") { - tag, err := ParseTag(tagString) - if err != nil { - return tags, err - } - tags = append(tags, tag) - } - return tags, nil -} - -type InstanceTag struct { - Key InstanceKey - T Tag -} - -func GetInstanceKeysByTags(tagsString string) (tagged *InstanceKeyMap, err error) { - tags, err := ParseIntersectTags(tagsString) - if err != nil { - return tagged, err - } - for i, tag := range tags { - taggedByTag, err := GetInstanceKeysByTag(tag) - if err != nil { - return tagged, err - } - if i == 0 { - tagged = taggedByTag - } else { - tagged = tagged.Intersect(taggedByTag) - } - } - return tagged, nil -} diff --git a/go/vt/vtorc/inst/tag_dao.go b/go/vt/vtorc/inst/tag_dao.go deleted file mode 100644 index 5b5962a9326..00000000000 --- a/go/vt/vtorc/inst/tag_dao.go +++ /dev/null @@ -1,206 +0,0 @@ -/* - Copyright 2015 Shlomi Noach, courtesy Booking.com - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -package inst - -import ( - "fmt" - - "vitess.io/vitess/go/vt/external/golib/sqlutils" - "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/vtorc/db" -) - -func PutInstanceTag(instanceKey *InstanceKey, tag *Tag) (err error) { - _, err = db.ExecVTOrc(` - insert - into database_instance_tags ( - hostname, port, tag_name, tag_value, last_updated - ) VALUES ( - ?, ?, ?, ?, NOW() - ) - on duplicate key update - tag_value=values(tag_value), - last_updated=values(last_updated) - `, - instanceKey.Hostname, - instanceKey.Port, - tag.TagName, - tag.TagValue, - ) - return err -} - -func Untag(instanceKey *InstanceKey, tag *Tag) (tagged *InstanceKeyMap, err error) { - if tag == nil { - errMsg := "untag: tag is nil" - log.Errorf(errMsg) - return nil, fmt.Errorf(errMsg) - } - if tag.Negate { - errMsg := "untag: does not support negation" - log.Errorf(errMsg) - return nil, fmt.Errorf(errMsg) - } - if instanceKey == nil && !tag.HasValue { - errMsg := "untag: either indicate an instance or a tag value. Will not delete on-valued tag across instances" - log.Errorf(errMsg) - return nil, fmt.Errorf(errMsg) - } - var clause string - args := sqlutils.Args() - if tag.HasValue { - clause = `tag_name=? and tag_value=?` - args = append(args, tag.TagName, tag.TagValue) - } else { - clause = `tag_name=?` - args = append(args, tag.TagName) - } - if instanceKey != nil { - clause = fmt.Sprintf("%s and hostname=? 
and port=?", clause) - args = append(args, instanceKey.Hostname, instanceKey.Port) - } - tagged = NewInstanceKeyMap() - query := fmt.Sprintf(` - select - hostname, - port - from - database_instance_tags - where - %s - order by hostname, port - `, clause, - ) - _ = db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error { - key, _ := NewResolveInstanceKey(m.GetString("hostname"), m.GetInt("port")) - tagged.AddKey(*key) - return nil - }) - - query = fmt.Sprintf(` - delete from - database_instance_tags - where - %s - `, clause, - ) - if _, err = db.ExecVTOrc(query, args...); err != nil { - log.Error(err) - return tagged, err - } - _ = AuditOperation("delete-instance-tag", instanceKey, tag.String()) - return tagged, nil -} - -func ReadInstanceTag(instanceKey *InstanceKey, tag *Tag) (tagExists bool, err error) { - query := ` - select - tag_value - from - database_instance_tags - where - hostname = ? - and port = ? - and tag_name = ? - ` - args := sqlutils.Args(instanceKey.Hostname, instanceKey.Port, tag.TagName) - err = db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error { - tag.TagValue = m.GetString("tag_value") - tagExists = true - return nil - }) - - if err != nil { - log.Error(err) - } - return tagExists, err -} - -func ReadInstanceTags(instanceKey *InstanceKey) (tags [](*Tag), err error) { - tags = [](*Tag){} - query := ` - select - tag_name, tag_value - from - database_instance_tags - where - hostname = ? - and port = ? 
- order by tag_name - ` - args := sqlutils.Args(instanceKey.Hostname, instanceKey.Port) - err = db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error { - tag := &Tag{ - TagName: m.GetString("tag_name"), - TagValue: m.GetString("tag_value"), - } - tags = append(tags, tag) - return nil - }) - - if err != nil { - log.Error(err) - } - return tags, err -} - -func GetInstanceKeysByTag(tag *Tag) (tagged *InstanceKeyMap, err error) { - if tag == nil { - errMsg := "GetInstanceKeysByTag: tag is nil" - log.Errorf(errMsg) - return nil, fmt.Errorf(errMsg) - } - clause := `` - args := sqlutils.Args() - if tag.HasValue && !tag.Negate { - // exists and equals - clause = `tag_name=? and tag_value=?` - args = append(args, tag.TagName, tag.TagValue) - } else if !tag.HasValue && !tag.Negate { - // exists - clause = `tag_name=?` - args = append(args, tag.TagName) - } else if tag.HasValue && tag.Negate { - // exists and not equal - clause = `tag_name=? and tag_value!=?` - args = append(args, tag.TagName, tag.TagValue) - } else if !tag.HasValue && tag.Negate { - // does not exist - clause = `1=1 group by hostname, port having sum(tag_name=?)=0` - args = append(args, tag.TagName) - } - tagged = NewInstanceKeyMap() - query := fmt.Sprintf(` - select - hostname, - port - from - database_instance_tags - where - %s - order by hostname, port - `, clause) - err = db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error { - key, _ := NewResolveInstanceKey(m.GetString("hostname"), m.GetInt("port")) - tagged.AddKey(*key) - return nil - }) - if err != nil { - log.Error(err) - } - return tagged, err -} diff --git a/go/vt/vtorc/inst/tag_test.go b/go/vt/vtorc/inst/tag_test.go deleted file mode 100644 index 0ce182b7fb2..00000000000 --- a/go/vt/vtorc/inst/tag_test.go +++ /dev/null @@ -1,141 +0,0 @@ -package inst - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -//nolint:staticcheck -func TestParseTag(t *testing.T) { - { - tag, err := ParseTag("") - require.True(t, tag == nil) - 
require.Error(t, err) - } - { - tag, err := ParseTag("=") - require.True(t, tag == nil) - require.Error(t, err) - } - { - tag, err := ParseTag("=backup") - require.True(t, tag == nil) - require.Error(t, err) - } - { - tag, err := ParseTag(" =backup") - require.True(t, tag == nil) - require.Error(t, err) - } - { - tag, err := ParseTag("role") - require.NoError(t, err) - require.True(t, tag != nil) - require.Equal(t, tag.TagName, "role") - require.Equal(t, tag.TagValue, "") - require.False(t, tag.Negate) - require.False(t, tag.HasValue) - - require.Equal(t, tag.String(), "role=") - } - { - tag, err := ParseTag("role=") - require.NoError(t, err) - require.True(t, tag != nil) - require.Equal(t, tag.TagName, "role") - require.Equal(t, tag.TagValue, "") - require.False(t, tag.Negate) - require.True(t, tag.HasValue) - - require.Equal(t, tag.String(), "role=") - - } - { - tag, err := ParseTag("role=backup") - require.NoError(t, err) - require.True(t, tag != nil) - require.Equal(t, tag.TagName, "role") - require.Equal(t, tag.TagValue, "backup") - require.False(t, tag.Negate) - require.True(t, tag.HasValue) - - require.Equal(t, tag.String(), "role=backup") - } - { - tag, err := ParseTag("!role") - require.NoError(t, err) - require.True(t, tag != nil) - require.Equal(t, tag.TagName, "role") - require.True(t, tag.Negate) - require.False(t, tag.HasValue) - } - { - tag, err := ParseTag("~role=backup") - require.NoError(t, err) - require.True(t, tag != nil) - require.Equal(t, tag.TagName, "role") - require.Equal(t, tag.TagValue, "backup") - require.True(t, tag.Negate) - require.True(t, tag.HasValue) - } -} - -func TestParseIntersectTags(t *testing.T) { - { - _, err := ParseIntersectTags("") - require.Error(t, err) - } - { - _, err := ParseIntersectTags(",") - require.Error(t, err) - } - { - _, err := ParseIntersectTags(",,,") - require.Error(t, err) - } - { - _, err := ParseIntersectTags("role,") - require.Error(t, err) - } - { - tags, err := ParseIntersectTags("role") - 
require.NoError(t, err) - require.Equal(t, len(tags), 1) - - require.Equal(t, tags[0].TagName, "role") - require.Equal(t, tags[0].TagValue, "") - require.False(t, tags[0].Negate) - require.False(t, tags[0].HasValue) - } - { - tags, err := ParseIntersectTags("role,dc") - require.NoError(t, err) - require.Equal(t, len(tags), 2) - - require.Equal(t, tags[0].TagName, "role") - require.Equal(t, tags[0].TagValue, "") - require.False(t, tags[0].Negate) - require.False(t, tags[0].HasValue) - - require.Equal(t, tags[1].TagName, "dc") - require.Equal(t, tags[1].TagValue, "") - require.False(t, tags[1].Negate) - require.False(t, tags[1].HasValue) - } - { - tags, err := ParseIntersectTags("role=backup, !dc=ny") - require.NoError(t, err) - require.Equal(t, len(tags), 2) - - require.Equal(t, tags[0].TagName, "role") - require.Equal(t, tags[0].TagValue, "backup") - require.False(t, tags[0].Negate) - require.True(t, tags[0].HasValue) - - require.Equal(t, tags[1].TagName, "dc") - require.Equal(t, tags[1].TagValue, "ny") - require.True(t, tags[1].Negate) - require.True(t, tags[1].HasValue) - } -} diff --git a/go/vt/vtorc/logic/tablet_discovery.go b/go/vt/vtorc/logic/tablet_discovery.go index 92ffa8df583..30827036044 100644 --- a/go/vt/vtorc/logic/tablet_discovery.go +++ b/go/vt/vtorc/logic/tablet_discovery.go @@ -36,6 +36,7 @@ import ( "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/topotools" + "vitess.io/vitess/go/vt/vtctl/reparentutil" "vitess.io/vitess/go/vt/vtorc/config" "vitess.io/vitess/go/vt/vtorc/db" "vitess.io/vitess/go/vt/vtorc/inst" @@ -65,8 +66,6 @@ func RegisterFlags(fs *pflag.FlagSet) { func OpenTabletDiscovery() <-chan time.Time { // TODO(sougou): If there's a shutdown signal, we have to close the topo. ts = topo.Open() - // TODO(sougou): remove ts and push some functions into inst. - inst.TopoServ = ts tmc = tmclient.NewTabletManagerClient() // Clear existing cache and perform a new refresh. 
if _, err := db.ExecVTOrc("delete from vitess_tablet"); err != nil { @@ -77,12 +76,12 @@ func OpenTabletDiscovery() <-chan time.Time { // refreshAllTablets reloads the tablets from topo and discovers the ones which haven't been refreshed in a while func refreshAllTablets() { - refreshTabletsUsing(func(instanceKey *inst.InstanceKey) { - DiscoverInstance(*instanceKey, false /* forceDiscovery */) + refreshTabletsUsing(func(tabletAlias string) { + DiscoverInstance(tabletAlias, false /* forceDiscovery */) }, false /* forceRefresh */) } -func refreshTabletsUsing(loader func(instanceKey *inst.InstanceKey), forceRefresh bool) { +func refreshTabletsUsing(loader func(tabletAlias string), forceRefresh bool) { if !IsLeaderOrActive() { return } @@ -151,13 +150,13 @@ func refreshTabletsUsing(loader func(instanceKey *inst.InstanceKey), forceRefres } } -func refreshTabletsInCell(ctx context.Context, cell string, loader func(instanceKey *inst.InstanceKey), forceRefresh bool) { +func refreshTabletsInCell(ctx context.Context, cell string, loader func(tabletAlias string), forceRefresh bool) { tablets, err := topotools.GetTabletMapForCell(ctx, ts, cell) if err != nil { log.Errorf("Error fetching topo info for cell %v: %v", cell, err) return } - query := "select hostname, port, info from vitess_tablet where cell = ?" + query := "select alias from vitess_tablet where cell = ?" 
args := sqlutils.Args(cell) refreshTablets(tablets, query, args, loader, forceRefresh, nil) } @@ -168,8 +167,8 @@ func refreshTabletsInCell(ctx context.Context, cell string, loader func(instance func forceRefreshAllTabletsInShard(ctx context.Context, keyspace, shard string, tabletsToIgnore []string) { refreshCtx, refreshCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) defer refreshCancel() - refreshTabletsInKeyspaceShard(refreshCtx, keyspace, shard, func(instanceKey *inst.InstanceKey) { - DiscoverInstance(*instanceKey, true) + refreshTabletsInKeyspaceShard(refreshCtx, keyspace, shard, func(tabletAlias string) { + DiscoverInstance(tabletAlias, true) }, true, tabletsToIgnore) } @@ -177,24 +176,24 @@ func forceRefreshAllTabletsInShard(ctx context.Context, keyspace, shard string, // of the given keyspace-shard. func refreshTabletInfoOfShard(ctx context.Context, keyspace, shard string) { log.Infof("refresh of tablet records of shard - %v/%v", keyspace, shard) - refreshTabletsInKeyspaceShard(ctx, keyspace, shard, func(instanceKey *inst.InstanceKey) { + refreshTabletsInKeyspaceShard(ctx, keyspace, shard, func(tabletAlias string) { // No-op // We only want to refresh the tablet information for the given shard }, false, nil) } -func refreshTabletsInKeyspaceShard(ctx context.Context, keyspace, shard string, loader func(instanceKey *inst.InstanceKey), forceRefresh bool, tabletsToIgnore []string) { +func refreshTabletsInKeyspaceShard(ctx context.Context, keyspace, shard string, loader func(tabletAlias string), forceRefresh bool, tabletsToIgnore []string) { tablets, err := ts.GetTabletMapForShard(ctx, keyspace, shard) if err != nil { log.Errorf("Error fetching tablets for keyspace/shard %v/%v: %v", keyspace, shard, err) return } - query := "select hostname, port, info from vitess_tablet where keyspace = ? and shard = ?" + query := "select alias from vitess_tablet where keyspace = ? and shard = ?" 
args := sqlutils.Args(keyspace, shard) refreshTablets(tablets, query, args, loader, forceRefresh, tabletsToIgnore) } -func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []any, loader func(instanceKey *inst.InstanceKey), forceRefresh bool, tabletsToIgnore []string) { +func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []any, loader func(tabletAlias string), forceRefresh bool, tabletsToIgnore []string) { // Discover new tablets. // TODO(sougou): enhance this to work with multi-schema, // where each instanceKey can have multiple tablets. @@ -205,15 +204,12 @@ func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []an if tablet.Type != topodatapb.TabletType_PRIMARY && !topo.IsReplicaType(tablet.Type) { continue } - latestInstances[topoproto.TabletAliasString(tablet.Alias)] = true + tabletAliasString := topoproto.TabletAliasString(tablet.Alias) + latestInstances[tabletAliasString] = true if tablet.MysqlHostname == "" { continue } - instanceKey := inst.InstanceKey{ - Hostname: tablet.MysqlHostname, - Port: int(tablet.MysqlPort), - } - old, err := inst.ReadTablet(instanceKey) + old, err := inst.ReadTablet(tabletAliasString) if err != nil && err != inst.ErrTabletAliasNil { log.Error(err) continue @@ -231,55 +227,34 @@ func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []an if slices.Contains(tabletsToIgnore, topoproto.TabletAliasString(tablet.Alias)) { return } - loader(&instanceKey) + loader(tabletAliasString) }() log.Infof("Discovered: %v", tablet) } wg.Wait() // Forget tablets that were removed. 
- toForget := make(map[inst.InstanceKey]*topodatapb.Tablet) + var toForget []string err := db.QueryVTOrc(query, args, func(row sqlutils.RowMap) error { - curKey := inst.InstanceKey{ - Hostname: row.GetString("hostname"), - Port: row.GetInt("port"), - } - tablet := &topodatapb.Tablet{} - opts := prototext.UnmarshalOptions{DiscardUnknown: true} - if err := opts.Unmarshal([]byte(row.GetString("info")), tablet); err != nil { - log.Error(err) - return nil - } - if !latestInstances[topoproto.TabletAliasString(tablet.Alias)] { - toForget[curKey] = tablet + tabletAlias := row.GetString("alias") + if !latestInstances[tabletAlias] { + toForget = append(toForget, tabletAlias) } return nil }) if err != nil { log.Error(err) } - for instanceKey, tablet := range toForget { - log.Infof("Forgetting: %v", tablet) - _, err := db.ExecVTOrc(` - delete - from vitess_tablet - where - hostname=? and port=?`, - instanceKey.Hostname, - instanceKey.Port, - ) - if err != nil { - log.Error(err) - } - if err := inst.ForgetInstance(&instanceKey); err != nil { + for _, tabletAlias := range toForget { + if err := inst.ForgetInstance(tabletAlias); err != nil { log.Error(err) } } } // LockShard locks the keyspace-shard preventing others from performing conflicting actions. 
-func LockShard(ctx context.Context, instanceKey inst.InstanceKey) (context.Context, func(*error), error) { - if instanceKey.Hostname == "" { +func LockShard(ctx context.Context, tabletAlias string) (context.Context, func(*error), error) { + if tabletAlias == "" { return nil, nil, errors.New("Can't lock shard: instance is unspecified") } val := atomic.LoadInt32(&hasReceivedSIGTERM) @@ -287,7 +262,7 @@ func LockShard(ctx context.Context, instanceKey inst.InstanceKey) (context.Conte return nil, nil, errors.New("Can't lock shard: SIGTERM received") } - tablet, err := inst.ReadTablet(instanceKey) + tablet, err := inst.ReadTablet(tabletAlias) if err != nil { return nil, nil, err } @@ -322,11 +297,7 @@ func setReplicationSource(ctx context.Context, replica *topodatapb.Tablet, prima // shardPrimary finds the primary of the given keyspace-shard by reading the vtorc backend func shardPrimary(keyspace string, shard string) (primary *topodatapb.Tablet, err error) { query := `SELECT - info, - hostname, - port, - tablet_type, - primary_timestamp + info FROM vitess_tablet WHERE @@ -351,10 +322,10 @@ func shardPrimary(keyspace string, shard string) (primary *topodatapb.Tablet, er } // restartsReplication restarts the replication on the provided replicaKey. 
It also sets the correct semi-sync settings when it starts replication -func restartReplication(replicaKey *inst.InstanceKey) error { - replicaTablet, err := inst.ReadTablet(*replicaKey) +func restartReplication(replicaAlias string) error { + replicaTablet, err := inst.ReadTablet(replicaAlias) if err != nil { - log.Info("Could not read tablet - %+v", replicaKey) + log.Info("Could not read tablet - %+v", replicaAlias) return err } @@ -364,7 +335,7 @@ func restartReplication(replicaKey *inst.InstanceKey) error { return err } - durabilityPolicy, err := inst.GetDurabilityPolicy(replicaTablet) + durabilityPolicy, err := inst.GetDurabilityPolicy(replicaTablet.Keyspace) if err != nil { log.Info("Could not read the durability policy for %v/%v", replicaTablet.Keyspace, replicaTablet.Shard) return err @@ -377,7 +348,7 @@ func restartReplication(replicaKey *inst.InstanceKey) error { log.Info("Could not stop replication on %v", topoproto.TabletAliasString(replicaTablet.Alias)) return err } - err = tmc.StartReplication(ctx, replicaTablet, inst.IsReplicaSemiSync(durabilityPolicy, primaryTablet, replicaTablet)) + err = tmc.StartReplication(ctx, replicaTablet, reparentutil.IsReplicaSemiSync(durabilityPolicy, primaryTablet, replicaTablet)) if err != nil { log.Info("Could not start replication on %v", topoproto.TabletAliasString(replicaTablet.Alias)) return err diff --git a/go/vt/vtorc/logic/tablet_discovery_test.go b/go/vt/vtorc/logic/tablet_discovery_test.go index ee107056033..1166dd2e40d 100644 --- a/go/vt/vtorc/logic/tablet_discovery_test.go +++ b/go/vt/vtorc/logic/tablet_discovery_test.go @@ -27,11 +27,10 @@ import ( "google.golang.org/protobuf/proto" "vitess.io/vitess/go/vt/external/golib/sqlutils" - "vitess.io/vitess/go/vt/topo/topoproto" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/proto/vttime" "vitess.io/vitess/go/vt/topo/memorytopo" + "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/vtorc/db" 
"vitess.io/vitess/go/vt/vtorc/inst" ) @@ -105,18 +104,14 @@ func TestRefreshTabletsInKeyspaceShard(t *testing.T) { ts = oldTs }() - // Open the vtorc - // After the test completes delete everything from the vitess_tablet table - orcDb, err := db.OpenVTOrc() - require.NoError(t, err) + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. defer func() { - _, err = orcDb.Exec("delete from vitess_tablet") - require.NoError(t, err) + db.ClearVTOrcDatabase() }() // Create a memory topo-server and create the keyspace and shard records ts = memorytopo.NewServer(cell1) - _, err = ts.GetOrCreateShard(context.Background(), keyspace, shard) + _, err := ts.GetOrCreateShard(context.Background(), keyspace, shard) require.NoError(t, err) // Add tablets to the topo-server @@ -233,22 +228,16 @@ func TestShardPrimary(t *testing.T) { ts = oldTs }() - // Open the vtorc - // After the test completes delete everything from the vitess_tablet table - orcDb, err := db.OpenVTOrc() - require.NoError(t, err) - defer func() { - _, err = orcDb.Exec("delete from vitess_tablet") - require.NoError(t, err) - }() - for _, testcase := range testcases { t.Run(testcase.name, func(t *testing.T) { - _, err = orcDb.Exec("delete from vitess_tablet") + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. 
+ defer func() { + db.ClearVTOrcDatabase() + }() // Create a memory topo-server and create the keyspace and shard records ts = memorytopo.NewServer(cell1) - _, err = ts.GetOrCreateShard(context.Background(), keyspace, shard) + _, err := ts.GetOrCreateShard(context.Background(), keyspace, shard) require.NoError(t, err) // Add tablets to the topo-server @@ -279,7 +268,7 @@ func verifyRefreshTabletsInKeyspaceShard(t *testing.T, forceRefresh bool, instan var instancesRefreshed atomic.Int32 instancesRefreshed.Store(0) // call refreshTabletsInKeyspaceShard while counting all the instances that are refreshed - refreshTabletsInKeyspaceShard(context.Background(), keyspace, shard, func(instanceKey *inst.InstanceKey) { + refreshTabletsInKeyspaceShard(context.Background(), keyspace, shard, func(string) { instancesRefreshed.Add(1) }, forceRefresh, tabletsToIgnore) // Verify that all the tablets are present in the database @@ -295,16 +284,13 @@ func verifyRefreshTabletsInKeyspaceShard(t *testing.T, forceRefresh bool, instan // is the same as the one provided or reading it gives the same error as expected func verifyTabletInfo(t *testing.T, tabletWanted *topodatapb.Tablet, errString string) { t.Helper() - tabletKey := inst.InstanceKey{ - Hostname: hostname, - Port: int(tabletWanted.MysqlPort), - } - tablet, err := inst.ReadTablet(tabletKey) + tabletAlias := topoproto.TabletAliasString(tabletWanted.Alias) + tablet, err := inst.ReadTablet(tabletAlias) if errString != "" { assert.EqualError(t, err, errString) } else { assert.NoError(t, err) - assert.EqualValues(t, tabletKey.Port, tablet.MysqlPort) + assert.EqualValues(t, tabletAlias, topoproto.TabletAliasString(tablet.Alias)) diff := cmp.Diff(tablet, tabletWanted, cmp.Comparer(proto.Equal)) assert.Empty(t, diff) } diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 0d1e07f53cb..611636c6e20 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go 
@@ -26,12 +26,10 @@ import ( "github.com/patrickmn/go-cache" - logutilpb "vitess.io/vitess/go/vt/proto/logutil" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" - "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/logutil" + logutilpb "vitess.io/vitess/go/vt/proto/logutil" "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/vtctl/reparentutil" "vitess.io/vitess/go/vt/vtorc/config" @@ -43,10 +41,6 @@ import ( type RecoveryType string const ( - PrimaryRecovery RecoveryType = "PrimaryRecovery" - CoPrimaryRecovery RecoveryType = "CoPrimaryRecovery" - IntermediatePrimaryRecovery RecoveryType = "IntermediatePrimaryRecovery" - CheckAndRecoverGenericProblemRecoveryName string = "CheckAndRecoverGenericProblem" RecoverDeadPrimaryRecoveryName string = "RecoverDeadPrimary" RecoverPrimaryHasPrimaryRecoveryName string = "RecoverPrimaryHasPrimary" @@ -93,60 +87,37 @@ const ( fixReplicaFunc ) -type RecoveryAcknowledgement struct { - CreatedAt time.Time - Owner string - Comment string - - Key inst.InstanceKey - ID int64 - UID string - AllRecoveries bool -} - -// BlockedTopologyRecovery represents an entry in the blocked_topology_recovery table -type BlockedTopologyRecovery struct { - FailedInstanceKey inst.InstanceKey - Analysis inst.AnalysisCode - LastBlockedTimestamp string - BlockingRecoveryID int64 -} - // TopologyRecovery represents an entry in the topology_recovery table type TopologyRecovery struct { inst.PostponedFunctionsContainer - ID int64 - UID string - AnalysisEntry inst.ReplicationAnalysis - SuccessorKey *inst.InstanceKey - SuccessorAlias string - IsActive bool - IsSuccessful bool - LostReplicas inst.InstanceKeyMap - ParticipatingInstanceKeys inst.InstanceKeyMap - AllErrors []string - RecoveryStartTimestamp string - RecoveryEndTimestamp string - ProcessingNodeHostname string - ProcessingNodeToken string - Acknowledged bool - AcknowledgedAt string - AcknowledgedBy string - AcknowledgedComment string - LastDetectionID int64 
- RelatedRecoveryID int64 - Type RecoveryType - RecoveryType PrimaryRecoveryType + ID int64 + UID string + AnalysisEntry inst.ReplicationAnalysis + SuccessorHostname string + SuccessorPort int + SuccessorAlias string + IsActive bool + IsSuccessful bool + AllErrors []string + RecoveryStartTimestamp string + RecoveryEndTimestamp string + ProcessingNodeHostname string + ProcessingNodeToken string + Acknowledged bool + AcknowledgedAt string + AcknowledgedBy string + AcknowledgedComment string + LastDetectionID int64 + RelatedRecoveryID int64 + Type RecoveryType + RecoveryType PrimaryRecoveryType } func NewTopologyRecovery(replicationAnalysis inst.ReplicationAnalysis) *TopologyRecovery { topologyRecovery := &TopologyRecovery{} topologyRecovery.UID = util.PrettyUniqueToken() topologyRecovery.AnalysisEntry = replicationAnalysis - topologyRecovery.SuccessorKey = nil - topologyRecovery.LostReplicas = *inst.NewInstanceKeyMap() - topologyRecovery.ParticipatingInstanceKeys = *inst.NewInstanceKeyMap() topologyRecovery.AllErrors = []string{} topologyRecovery.RecoveryType = NotPrimaryRecovery return topologyRecovery @@ -217,7 +188,6 @@ func AuditTopologyRecovery(topologyRecovery *TopologyRecovery, message string) e func resolveRecovery(topologyRecovery *TopologyRecovery, successorInstance *inst.Instance) error { if successorInstance != nil { - topologyRecovery.SuccessorKey = &successorInstance.Key topologyRecovery.SuccessorAlias = successorInstance.InstanceAlias topologyRecovery.IsSuccessful = true } @@ -225,13 +195,13 @@ func resolveRecovery(topologyRecovery *TopologyRecovery, successorInstance *inst } // recoverPrimaryHasPrimary resets the replication on the primary instance -func recoverPrimaryHasPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func recoverPrimaryHasPrimary(ctx context.Context, 
analysisEntry inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false, true) if topologyRecovery == nil { - _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixPrimaryHasPrimary.", analysisEntry.AnalyzedInstanceKey)) + _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixPrimaryHasPrimary.", analysisEntry.AnalyzedInstanceAlias)) return false, nil, err } - log.Infof("Analysis: %v, will fix incorrect primaryship %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) + log.Infof("Analysis: %v, will fix incorrect primaryship %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) // This has to be done in the end; whether successful or not, we should mark that the recovery is done. // So that after the active period passes, we are able to run other recoveries. defer func() { @@ -239,7 +209,7 @@ func recoverPrimaryHasPrimary(ctx context.Context, analysisEntry inst.Replicatio }() // Reset replication on current primary. - err = inst.ResetReplicationParameters(analysisEntry.AnalyzedInstanceKey) + err = inst.ResetReplicationParameters(analysisEntry.AnalyzedInstanceAlias) if err != nil { return false, topologyRecovery, err } @@ -248,31 +218,23 @@ func recoverPrimaryHasPrimary(ctx context.Context, analysisEntry inst.Replicatio // recoverDeadPrimary checks a given analysis, decides whether to take action, and possibly takes action // Returns true when action was taken. 
-func recoverDeadPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { - if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedPrimaryRecovery) { +func recoverDeadPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { + if !analysisEntry.ClusterDetails.HasAutomatedPrimaryRecovery { return false, nil, nil } // Read the tablet information from the database to find the shard and keyspace of the tablet - tablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceKey) + tablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceAlias) if err != nil { return false, nil, err } - var candidateTabletAlias *topodatapb.TabletAlias - if candidateInstanceKey != nil { - candidateTablet, err := inst.ReadTablet(*candidateInstanceKey) - if err != nil { - return false, nil, err - } - candidateTabletAlias = candidateTablet.Alias - } - topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery) + topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, true, true) if topologyRecovery == nil { - _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverDeadPrimary.", analysisEntry.AnalyzedInstanceKey)) + _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. 
Will not issue another RecoverDeadPrimary.", analysisEntry.AnalyzedInstanceAlias)) return false, nil, err } - log.Infof("Analysis: %v, deadprimary %+v with candidate %s", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateTabletAlias) + log.Infof("Analysis: %v, deadprimary %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) var promotedReplica *inst.Instance // This has to be done in the end; whether successful or not, we should mark that the recovery is done. // So that after the active period passes, we are able to run other recoveries. @@ -297,7 +259,6 @@ func recoverDeadPrimary(ctx context.Context, analysisEntry inst.ReplicationAnaly tablet.Keyspace, tablet.Shard, reparentutil.EmergencyReparentOptions{ - NewPrimaryAlias: candidateTabletAlias, IgnoreReplicas: nil, WaitReplicasTimeout: time.Duration(config.Config.WaitReplicasTimeoutSeconds) * time.Second, PreventCrossCellPromotion: config.Config.PreventCrossDataCenterPrimaryFailover, @@ -308,80 +269,73 @@ func recoverDeadPrimary(ctx context.Context, analysisEntry inst.ReplicationAnaly } if ev != nil && ev.NewPrimary != nil { - promotedReplica, _, _ = inst.ReadInstance(&inst.InstanceKey{ - Hostname: ev.NewPrimary.MysqlHostname, - Port: int(ev.NewPrimary.MysqlPort), - }) + promotedReplica, _, _ = inst.ReadInstance(topoproto.TabletAliasString(ev.NewPrimary.Alias)) } - postErsCompletion(topologyRecovery, analysisEntry, skipProcesses, promotedReplica) + postErsCompletion(topologyRecovery, analysisEntry, promotedReplica) return true, topologyRecovery, err } -func postErsCompletion(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, skipProcesses bool, promotedReplica *inst.Instance) { +func postErsCompletion(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, promotedReplica *inst.Instance) { if promotedReplica != nil { - message := fmt.Sprintf("promoted replica: %+v", promotedReplica.Key) + message := fmt.Sprintf("promoted replica: %+v", 
promotedReplica.InstanceAlias) _ = AuditTopologyRecovery(topologyRecovery, message) - _ = inst.AuditOperation("recover-dead-primary", &analysisEntry.AnalyzedInstanceKey, message) - } - // Now, see whether we are successful or not. From this point there's no going back. - if promotedReplica != nil { - // Success! - _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadPrimary: successfully promoted %+v", promotedReplica.Key)) + _ = inst.AuditOperation("recover-dead-primary", analysisEntry.AnalyzedInstanceAlias, message) + _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadPrimary: successfully promoted %+v", promotedReplica.InstanceAlias)) } } // checkAndRecoverGenericProblem is a general-purpose recovery function -func checkAndRecoverLockedSemiSyncPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func checkAndRecoverLockedSemiSyncPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { return false, nil, nil } // checkAndRecoverGenericProblem is a general-purpose recovery function -func checkAndRecoverGenericProblem(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { +func checkAndRecoverGenericProblem(ctx context.Context, analysisEntry inst.ReplicationAnalysis) (bool, *TopologyRecovery, error) { return false, nil, nil } // Force a re-read of a topology instance; this is done because we need to substantiate a suspicion // that we may have a failover scenario. we want to speed up reading the complete picture. 
-func emergentlyReadTopologyInstance(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) (instance *inst.Instance) { - if existsInCacheError := emergencyReadTopologyInstanceMap.Add(instanceKey.StringCode(), true, cache.DefaultExpiration); existsInCacheError != nil { +func emergentlyReadTopologyInstance(tabletAlias string, analysisCode inst.AnalysisCode) (instance *inst.Instance) { + if existsInCacheError := emergencyReadTopologyInstanceMap.Add(tabletAlias, true, cache.DefaultExpiration); existsInCacheError != nil { // Just recently attempted return nil } - instance, _ = inst.ReadTopologyInstance(instanceKey) - _ = inst.AuditOperation("emergently-read-topology-instance", instanceKey, string(analysisCode)) + instance, _ = inst.ReadTopologyInstance(tabletAlias) + _ = inst.AuditOperation("emergently-read-topology-instance", tabletAlias, string(analysisCode)) return instance } // Force reading of replicas of given instance. This is because we suspect the instance is dead, and want to speed up // detection of replication failure from its replicas. -func emergentlyReadTopologyInstanceReplicas(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) { - replicas, err := inst.ReadReplicaInstancesIncludingBinlogServerSubReplicas(instanceKey) +func emergentlyReadTopologyInstanceReplicas(primaryHost string, primaryPort int, analysisCode inst.AnalysisCode) { + replicas, err := inst.ReadReplicaInstancesIncludingBinlogServerSubReplicas(primaryHost, primaryPort) if err != nil { return } for _, replica := range replicas { - go emergentlyReadTopologyInstance(&replica.Key, analysisCode) + go emergentlyReadTopologyInstance(replica.InstanceAlias, analysisCode) } } // emergentlyRestartReplicationOnTopologyInstance forces a RestartReplication on a given instance. 
-func emergentlyRestartReplicationOnTopologyInstance(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) { - if existsInCacheError := emergencyRestartReplicaTopologyInstanceMap.Add(instanceKey.StringCode(), true, cache.DefaultExpiration); existsInCacheError != nil { +func emergentlyRestartReplicationOnTopologyInstance(tabletAlias string, analysisCode inst.AnalysisCode) { + if existsInCacheError := emergencyRestartReplicaTopologyInstanceMap.Add(tabletAlias, true, cache.DefaultExpiration); existsInCacheError != nil { // Just recently attempted on this specific replica return } go inst.ExecuteOnTopology(func() { - _ = restartReplication(instanceKey) - _ = inst.AuditOperation("emergently-restart-replication-topology-instance", instanceKey, string(analysisCode)) + _ = restartReplication(tabletAlias) + _ = inst.AuditOperation("emergently-restart-replication-topology-instance", tabletAlias, string(analysisCode)) }) } -func beginEmergencyOperationGracefulPeriod(instanceKey *inst.InstanceKey) { - emergencyOperationGracefulPeriodMap.Set(instanceKey.StringCode(), true, cache.DefaultExpiration) +func beginEmergencyOperationGracefulPeriod(tabletAlias string) { + emergencyOperationGracefulPeriodMap.Set(tabletAlias, true, cache.DefaultExpiration) } -func isInEmergencyOperationGracefulPeriod(instanceKey *inst.InstanceKey) bool { - _, found := emergencyOperationGracefulPeriodMap.Get(instanceKey.StringCode()) +func isInEmergencyOperationGracefulPeriod(tabletAlias string) bool { + _, found := emergencyOperationGracefulPeriodMap.Get(tabletAlias) return found } @@ -390,26 +344,25 @@ func isInEmergencyOperationGracefulPeriod(instanceKey *inst.InstanceKey) bool { // This can be useful in scenarios where the primary has Too Many Connections, but long-time connected // replicas are not seeing this; when they stop+start replication, they need to re-authenticate and // that's where we hope they realize the primary is bad. 
-func emergentlyRestartReplicationOnTopologyInstanceReplicas(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) { - if existsInCacheError := emergencyRestartReplicaTopologyInstanceMap.Add(instanceKey.StringCode(), true, cache.DefaultExpiration); existsInCacheError != nil { +func emergentlyRestartReplicationOnTopologyInstanceReplicas(primaryHost string, primaryPort int, tabletAlias string, analysisCode inst.AnalysisCode) { + if existsInCacheError := emergencyRestartReplicaTopologyInstanceMap.Add(tabletAlias, true, cache.DefaultExpiration); existsInCacheError != nil { // While each replica's RestartReplication() is throttled on its own, it's also wasteful to // iterate all replicas all the time. This is the reason why we do grand-throttle check. return } - beginEmergencyOperationGracefulPeriod(instanceKey) + beginEmergencyOperationGracefulPeriod(tabletAlias) - replicas, err := inst.ReadReplicaInstancesIncludingBinlogServerSubReplicas(instanceKey) + replicas, err := inst.ReadReplicaInstancesIncludingBinlogServerSubReplicas(primaryHost, primaryPort) if err != nil { return } for _, replica := range replicas { - replicaKey := &replica.Key - go emergentlyRestartReplicationOnTopologyInstance(replicaKey, analysisCode) + go emergentlyRestartReplicationOnTopologyInstance(replica.InstanceAlias, analysisCode) } } -func emergentlyRecordStaleBinlogCoordinates(instanceKey *inst.InstanceKey, binlogCoordinates *inst.BinlogCoordinates) { - err := inst.RecordStaleInstanceBinlogCoordinates(instanceKey, binlogCoordinates) +func emergentlyRecordStaleBinlogCoordinates(tabletAlias string, binlogCoordinates *inst.BinlogCoordinates) { + err := inst.RecordStaleInstanceBinlogCoordinates(tabletAlias, binlogCoordinates) if err != nil { log.Error(err) } @@ -417,19 +370,19 @@ func emergentlyRecordStaleBinlogCoordinates(instanceKey *inst.InstanceKey, binlo // checkAndExecuteFailureDetectionProcesses tries to register for failure detection and potentially executes // failure-detection 
processes. -func checkAndExecuteFailureDetectionProcesses(analysisEntry inst.ReplicationAnalysis, skipProcesses bool) (detectionRegistrationSuccess bool, processesExecutionAttempted bool, err error) { +func checkAndExecuteFailureDetectionProcesses(analysisEntry inst.ReplicationAnalysis) (detectionRegistrationSuccess bool, processesExecutionAttempted bool, err error) { if ok, _ := AttemptFailureDetectionRegistration(&analysisEntry); !ok { - if util.ClearToLog("checkAndExecuteFailureDetectionProcesses", analysisEntry.AnalyzedInstanceKey.StringCode()) { - log.Infof("checkAndExecuteFailureDetectionProcesses: could not register %+v detection on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) + if util.ClearToLog("checkAndExecuteFailureDetectionProcesses", analysisEntry.AnalyzedInstanceAlias) { + log.Infof("checkAndExecuteFailureDetectionProcesses: could not register %+v detection on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) } return false, false, nil } - log.Infof("topology_recovery: detected %+v failure on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) + log.Infof("topology_recovery: detected %+v failure on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) return true, false, nil } // getCheckAndRecoverFunctionCode gets the recovery function code to use for the given analysis. 
-func getCheckAndRecoverFunctionCode(analysisCode inst.AnalysisCode, analyzedInstanceKey *inst.InstanceKey) recoveryFunction { +func getCheckAndRecoverFunctionCode(analysisCode inst.AnalysisCode, tabletAlias string) recoveryFunction { switch analysisCode { // primary case inst.DeadPrimary, inst.DeadPrimaryAndSomeReplicas: @@ -437,14 +390,14 @@ func getCheckAndRecoverFunctionCode(analysisCode inst.AnalysisCode, analyzedInst if !config.ERSEnabled() { return noRecoveryFunc } - if isInEmergencyOperationGracefulPeriod(analyzedInstanceKey) { + if isInEmergencyOperationGracefulPeriod(tabletAlias) { return recoverGenericProblemFunc } return recoverDeadPrimaryFunc case inst.PrimaryHasPrimary: return recoverPrimaryHasPrimaryFunc case inst.LockedSemiSyncPrimary: - if isInEmergencyOperationGracefulPeriod(analyzedInstanceKey) { + if isInEmergencyOperationGracefulPeriod(tabletAlias) { return recoverGenericProblemFunc } return recoverLockedSemiSyncPrimaryFunc @@ -502,7 +455,7 @@ func hasActionableRecovery(recoveryFunctionCode recoveryFunction) bool { // getCheckAndRecoverFunction gets the recovery function for the given code. 
func getCheckAndRecoverFunction(recoveryFunctionCode recoveryFunction) ( - checkAndRecoverFunction func(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error), + checkAndRecoverFunction func(ctx context.Context, analysisEntry inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error), ) { switch recoveryFunctionCode { case noRecoveryFunc: @@ -563,37 +516,37 @@ func isClusterWideRecovery(recoveryFunctionCode recoveryFunction) bool { // analysisEntriesHaveSameRecovery tells whether the two analysis entries have the same recovery function or not func analysisEntriesHaveSameRecovery(prevAnalysis, newAnalysis inst.ReplicationAnalysis) bool { - prevRecoveryFunctionCode := getCheckAndRecoverFunctionCode(prevAnalysis.Analysis, &prevAnalysis.AnalyzedInstanceKey) - newRecoveryFunctionCode := getCheckAndRecoverFunctionCode(newAnalysis.Analysis, &newAnalysis.AnalyzedInstanceKey) + prevRecoveryFunctionCode := getCheckAndRecoverFunctionCode(prevAnalysis.Analysis, prevAnalysis.AnalyzedInstanceAlias) + newRecoveryFunctionCode := getCheckAndRecoverFunctionCode(newAnalysis.Analysis, newAnalysis.AnalyzedInstanceAlias) return prevRecoveryFunctionCode == newRecoveryFunctionCode } func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis) { switch analysisEntry.Analysis { case inst.DeadPrimaryAndReplicas: - go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstancePrimaryKey, analysisEntry.Analysis) + go emergentlyReadTopologyInstance(analysisEntry.AnalyzedInstancePrimaryAlias, analysisEntry.Analysis) case inst.UnreachablePrimary: - go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) - go emergentlyReadTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) + go 
emergentlyReadTopologyInstance(analysisEntry.AnalyzedInstanceAlias, analysisEntry.Analysis) + go emergentlyReadTopologyInstanceReplicas(analysisEntry.AnalyzedInstanceHostname, analysisEntry.AnalyzedInstancePort, analysisEntry.Analysis) case inst.UnreachablePrimaryWithLaggingReplicas: - go emergentlyRestartReplicationOnTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) + go emergentlyRestartReplicationOnTopologyInstanceReplicas(analysisEntry.AnalyzedInstanceHostname, analysisEntry.AnalyzedInstancePort, analysisEntry.AnalyzedInstanceAlias, analysisEntry.Analysis) case inst.LockedSemiSyncPrimaryHypothesis: - go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) - go emergentlyRecordStaleBinlogCoordinates(&analysisEntry.AnalyzedInstanceKey, &analysisEntry.AnalyzedInstanceBinlogCoordinates) + go emergentlyReadTopologyInstance(analysisEntry.AnalyzedInstanceAlias, analysisEntry.Analysis) + go emergentlyRecordStaleBinlogCoordinates(analysisEntry.AnalyzedInstanceAlias, &analysisEntry.AnalyzedInstanceBinlogCoordinates) case inst.AllPrimaryReplicasNotReplicating: - go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) + go emergentlyReadTopologyInstance(analysisEntry.AnalyzedInstanceAlias, analysisEntry.Analysis) case inst.AllPrimaryReplicasNotReplicatingOrDead: - go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) + go emergentlyReadTopologyInstance(analysisEntry.AnalyzedInstanceAlias, analysisEntry.Analysis) } } // executeCheckAndRecoverFunction will choose the correct check & recovery function based on analysis. 
// It executes the function synchronuously -func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis) (err error) { countPendingRecoveries.Add(1) defer countPendingRecoveries.Add(-1) - checkAndRecoverFunctionCode := getCheckAndRecoverFunctionCode(analysisEntry.Analysis, &analysisEntry.AnalyzedInstanceKey) + checkAndRecoverFunctionCode := getCheckAndRecoverFunctionCode(analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) isActionableRecovery := hasActionableRecovery(checkAndRecoverFunctionCode) analysisEntry.IsActionableRecovery = isActionableRecovery runEmergentOperations(&analysisEntry) @@ -601,26 +554,26 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand if checkAndRecoverFunctionCode == noRecoveryFunc { // Unhandled problem type if analysisEntry.Analysis != inst.NoProblem { - if util.ClearToLog("executeCheckAndRecoverFunction", analysisEntry.AnalyzedInstanceKey.StringCode()) { + if util.ClearToLog("executeCheckAndRecoverFunction", analysisEntry.AnalyzedInstanceAlias) { log.Warningf("executeCheckAndRecoverFunction: ignoring analysisEntry that has no action plan: %+v; key: %+v", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) + analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) } } - return false, nil, nil + return nil } // we have a recovery function; its execution still depends on filters if not disabled. 
- if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: detection", analysisEntry.AnalyzedInstanceKey.StringCode()) { - log.Infof("executeCheckAndRecoverFunction: proceeding with %+v detection on %+v; isActionable?: %+v; skipProcesses: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, isActionableRecovery, skipProcesses) + if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: detection", analysisEntry.AnalyzedInstanceAlias) { + log.Infof("executeCheckAndRecoverFunction: proceeding with %+v detection on %+v; isActionable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery) } // At this point we have validated there's a failure scenario for which we have a recovery path. // Initiate detection: - _, _, err = checkAndExecuteFailureDetectionProcesses(analysisEntry, skipProcesses) + _, _, err = checkAndExecuteFailureDetectionProcesses(analysisEntry) if err != nil { log.Errorf("executeCheckAndRecoverFunction: error on failure detection: %+v", err) - return false, nil, err + return err } // We don't mind whether detection really executed the processes or not // (it may have been silenced due to previous detection). We only care there's no error. @@ -632,22 +585,17 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand // Unexpected. 
Shouldn't get this log.Errorf("Unable to determine if recovery is disabled globally: %v", err) } else if recoveryDisabledGlobally { - if !forceInstanceRecovery { - log.Infof("CheckAndRecover: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+ - "skipProcesses: %v: NOT Recovering host (disabled globally)", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses) - - return false, nil, err - } log.Infof("CheckAndRecover: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+ - "skipProcesses: %v: recoveries disabled globally but forcing this recovery", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses) + "skipProcesses: %v: NOT Recovering host (disabled globally)", + analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, nil, false) + + return err } // We lock the shard here and then refresh the tablets information - ctx, unlock, err := LockShard(context.Background(), analysisEntry.AnalyzedInstanceKey) + ctx, unlock, err := LockShard(context.Background(), analysisEntry.AnalyzedInstanceAlias) if err != nil { - return false, nil, err + return err } defer unlock(&err) @@ -662,7 +610,7 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand // If they have, then recoveries like ReplicaSemiSyncMustNotBeSet, etc won't be valid anymore err := RefreshKeyspace(analysisEntry.AnalyzedKeyspace) if err != nil { - return false, nil, err + return err } // If we are about to run a cluster-wide recovery, it is imperative to first refresh all the tablets // of a shard because a new tablet could have been promoted, and we need to have this visibility before we @@ -670,7 +618,7 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand if isClusterWideRecovery(checkAndRecoverFunctionCode) { var tabletsToIgnore []string if checkAndRecoverFunctionCode == recoverDeadPrimaryFunc { - tabletsToIgnore = append(tabletsToIgnore, 
topoproto.TabletAliasString(analysisEntry.AnalyzedInstanceAlias)) + tabletsToIgnore = append(tabletsToIgnore, analysisEntry.AnalyzedInstanceAlias) } // We ignore the dead primary tablet because it is going to be unreachable. If all the other tablets aren't able to reach this tablet either, // we can proceed with the dead primary recovery. We don't need to refresh the information for this dead tablet. @@ -683,42 +631,39 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand // So, we only need to refresh the tablet info records (to know if the primary tablet has changed), // and the replication data of the new primary and this tablet. refreshTabletInfoOfShard(ctx, analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard) - DiscoverInstance(analysisEntry.AnalyzedInstanceKey, true) + DiscoverInstance(analysisEntry.AnalyzedInstanceAlias, true) primaryTablet, err := shardPrimary(analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard) if err != nil { log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+"skipProcesses: %v: error while finding the shard primary: %v", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses, err) - return false, nil, err - } - primaryInstanceKey := inst.InstanceKey{ - Hostname: primaryTablet.MysqlHostname, - Port: int(primaryTablet.MysqlPort), + analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, nil, false, err) + return err } + primaryTabletAlias := topoproto.TabletAliasString(primaryTablet.Alias) // We can skip the refresh if we know the tablet we are looking at is the primary tablet. // This would be the case for PrimaryHasPrimary recovery. We don't need to refresh the same tablet twice. 
- if !analysisEntry.AnalyzedInstanceKey.Equals(&primaryInstanceKey) { - DiscoverInstance(primaryInstanceKey, true) + if analysisEntry.AnalyzedInstanceAlias != primaryTabletAlias { + DiscoverInstance(primaryTabletAlias, true) } } alreadyFixed, err := checkIfAlreadyFixed(analysisEntry) if err != nil { log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+"skipProcesses: %v: error while trying to find if the problem is already fixed: %v", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses, err) - return false, nil, err + analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, nil, false, err) + return err } if alreadyFixed { log.Infof("Analysis: %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis) - return false, nil, nil + return nil } } // Actually attempt recovery: - if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceKey.StringCode()) { - log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v; skipProcesses: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, isActionableRecovery, skipProcesses) + if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceAlias) { + log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v; skipProcesses: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery, false) } - recoveryAttempted, topologyRecovery, err = getCheckAndRecoverFunction(checkAndRecoverFunctionCode)(ctx, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + recoveryAttempted, topologyRecovery, err := getCheckAndRecoverFunction(checkAndRecoverFunctionCode)(ctx, analysisEntry) if !recoveryAttempted { - return recoveryAttempted, topologyRecovery, err 
+ return err } recoveryName := getRecoverFunctionName(checkAndRecoverFunctionCode) recoveriesCounter.Add(recoveryName, 1) @@ -728,7 +673,7 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand recoveriesSuccessfulCounter.Add(recoveryName, 1) } if topologyRecovery == nil { - return recoveryAttempted, topologyRecovery, err + return err } if b, err := json.Marshal(topologyRecovery); err == nil { log.Infof("Topology recovery: %+v", string(b)) @@ -745,7 +690,7 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand // For all other recoveries, we would have changed the replication status of the analyzed tablet // so it doesn't hurt to re-read the information of this tablet, otherwise we'll requeue the same recovery // that we just completed because we would be using stale data. - DiscoverInstance(analysisEntry.AnalyzedInstanceKey, true) + DiscoverInstance(analysisEntry.AnalyzedInstanceAlias, true) } _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Waiting for %d postponed functions", topologyRecovery.PostponedFunctionsContainer.Len())) topologyRecovery.Wait() @@ -753,7 +698,7 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand if topologyRecovery.PostponedFunctionsContainer.Len() > 0 { _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Executed postponed functions: %+v", strings.Join(topologyRecovery.PostponedFunctionsContainer.Descriptions(), ", "))) } - return recoveryAttempted, topologyRecovery, err + return err } // checkIfAlreadyFixed checks whether the problem that the analysis entry represents has already been fixed by another agent or not @@ -766,7 +711,7 @@ func checkIfAlreadyFixed(analysisEntry inst.ReplicationAnalysis) (bool, error) { for _, entry := range analysisEntries { // If there is a analysis which has the same recovery required, then we should proceed with the recovery - if entry.AnalyzedInstanceKey.Equals(&analysisEntry.AnalyzedInstanceKey) && 
analysisEntriesHaveSameRecovery(analysisEntry, entry) { + if entry.AnalyzedInstanceAlias == analysisEntry.AnalyzedInstanceAlias && analysisEntriesHaveSameRecovery(analysisEntry, entry) { return false, nil } } @@ -776,67 +721,41 @@ func checkIfAlreadyFixed(analysisEntry inst.ReplicationAnalysis) (bool, error) { } // CheckAndRecover is the main entry point for the recovery mechanism -func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, promotedReplicaKey *inst.InstanceKey, err error) { +func CheckAndRecover() { // Allow the analysis to run even if we don't want to recover - replicationAnalysis, err := inst.GetReplicationAnalysis("", "", &inst.ReplicationAnalysisHints{IncludeDowntimed: true, AuditAnalysis: true}) + replicationAnalysis, err := inst.GetReplicationAnalysis("", "", &inst.ReplicationAnalysisHints{AuditAnalysis: true}) if err != nil { log.Error(err) - return false, nil, err + return } // intentionally iterating entries in random order for _, j := range rand.Perm(len(replicationAnalysis)) { analysisEntry := replicationAnalysis[j] - if specificInstance != nil { - // We are looking for a specific instance; if this is not the one, skip! - if !specificInstance.Equals(&analysisEntry.AnalyzedInstanceKey) { - continue - } - } - if analysisEntry.SkippableDueToDowntime && specificInstance == nil { - // Only recover a downtimed server if explicitly requested - continue - } - if specificInstance != nil { - // force mode. 
Keep it synchronuous - var topologyRecovery *TopologyRecovery - recoveryAttempted, topologyRecovery, err = executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses) + go func() { + err := executeCheckAndRecoverFunction(analysisEntry) if err != nil { log.Error(err) } - if topologyRecovery != nil { - promotedReplicaKey = topologyRecovery.SuccessorKey - } - } else { - go func() { - _, _, err := executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, false, skipProcesses) - if err != nil { - log.Error(err) - } - }() - } + }() + } - return recoveryAttempted, promotedReplicaKey, err } func postPrsCompletion(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, promotedReplica *inst.Instance) { if promotedReplica != nil { - message := fmt.Sprintf("promoted replica: %+v", promotedReplica.Key) + message := fmt.Sprintf("promoted replica: %+v", promotedReplica.InstanceAlias) _ = AuditTopologyRecovery(topologyRecovery, message) - _ = inst.AuditOperation(string(analysisEntry.Analysis), &analysisEntry.AnalyzedInstanceKey, message) - } - // Now, see whether we are successful or not. From this point there's no going back. - if promotedReplica != nil { - // Success! - _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%+v: successfully promoted %+v", analysisEntry.Analysis, promotedReplica.Key)) + _ = inst.AuditOperation(string(analysisEntry.Analysis), analysisEntry.AnalyzedInstanceAlias, message) + _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%+v: successfully promoted %+v", analysisEntry.Analysis, promotedReplica.InstanceAlias)) } } // electNewPrimary elects a new primary while none were present before. 
-func electNewPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func electNewPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false /*failIfFailedInstanceInActiveRecovery*/, true /*failIfClusterInActiveRecovery*/) if topologyRecovery == nil || err != nil { - _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another electNewPrimary.", analysisEntry.AnalyzedInstanceKey)) + _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another electNewPrimary.", analysisEntry.AnalyzedInstanceAlias)) return false, nil, err } log.Infof("Analysis: %v, will elect a new primary for %v:%v", analysisEntry.Analysis, analysisEntry.ClusterDetails.Keyspace, analysisEntry.ClusterDetails.Shard) @@ -848,7 +767,7 @@ func electNewPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis _ = resolveRecovery(topologyRecovery, promotedReplica) }() - analyzedTablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceKey) + analyzedTablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceAlias) if err != nil { return false, topologyRecovery, err } @@ -874,61 +793,58 @@ func electNewPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis ) if ev != nil && ev.NewPrimary != nil { - promotedReplica, _, _ = inst.ReadInstance(&inst.InstanceKey{ - Hostname: ev.NewPrimary.MysqlHostname, - Port: int(ev.NewPrimary.MysqlPort), - }) + promotedReplica, _, _ = inst.ReadInstance(topoproto.TabletAliasString(ev.NewPrimary.Alias)) } postPrsCompletion(topologyRecovery, analysisEntry, promotedReplica) return true, 
topologyRecovery, err } // fixPrimary sets the primary as read-write. -func fixPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func fixPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false, true) if topologyRecovery == nil { - _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixPrimary.", analysisEntry.AnalyzedInstanceKey)) + _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixPrimary.", analysisEntry.AnalyzedInstanceAlias)) return false, nil, err } - log.Infof("Analysis: %v, will fix primary to read-write %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) + log.Infof("Analysis: %v, will fix primary to read-write %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) // This has to be done in the end; whether successful or not, we should mark that the recovery is done. // So that after the active period passes, we are able to run other recoveries. 
defer func() { _ = resolveRecovery(topologyRecovery, nil) }() - analyzedTablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceKey) + analyzedTablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceAlias) if err != nil { return false, topologyRecovery, err } - durabilityPolicy, err := inst.GetDurabilityPolicy(analyzedTablet) + durabilityPolicy, err := inst.GetDurabilityPolicy(analyzedTablet.Keyspace) if err != nil { log.Info("Could not read the durability policy for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard) return false, topologyRecovery, err } - if err := tabletUndoDemotePrimary(ctx, analyzedTablet, inst.SemiSyncAckers(durabilityPolicy, analyzedTablet) > 0); err != nil { + if err := tabletUndoDemotePrimary(ctx, analyzedTablet, reparentutil.SemiSyncAckers(durabilityPolicy, analyzedTablet) > 0); err != nil { return true, topologyRecovery, err } return true, topologyRecovery, nil } // fixReplica sets the replica as read-only and points it at the current primary. -func fixReplica(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func fixReplica(ctx context.Context, analysisEntry inst.ReplicationAnalysis) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false, true) if topologyRecovery == nil { - _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixReplica.", analysisEntry.AnalyzedInstanceKey)) + _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. 
Will not issue another fixReplica.", analysisEntry.AnalyzedInstanceAlias)) return false, nil, err } - log.Infof("Analysis: %v, will fix replica %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) + log.Infof("Analysis: %v, will fix replica %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) // This has to be done in the end; whether successful or not, we should mark that the recovery is done. // So that after the active period passes, we are able to run other recoveries. defer func() { _ = resolveRecovery(topologyRecovery, nil) }() - analyzedTablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceKey) + analyzedTablet, err := inst.ReadTablet(analysisEntry.AnalyzedInstanceAlias) if err != nil { return false, topologyRecovery, err } @@ -939,7 +855,7 @@ func fixReplica(ctx context.Context, analysisEntry inst.ReplicationAnalysis, can return false, topologyRecovery, err } - durabilityPolicy, err := inst.GetDurabilityPolicy(analyzedTablet) + durabilityPolicy, err := inst.GetDurabilityPolicy(analyzedTablet.Keyspace) if err != nil { log.Info("Could not read the durability policy for %v/%v", analyzedTablet.Keyspace, analyzedTablet.Shard) return false, topologyRecovery, err @@ -951,6 +867,6 @@ func fixReplica(ctx context.Context, analysisEntry inst.ReplicationAnalysis, can return true, topologyRecovery, err } - err = setReplicationSource(ctx, analyzedTablet, primaryTablet, inst.IsReplicaSemiSync(durabilityPolicy, primaryTablet, analyzedTablet)) + err = setReplicationSource(ctx, analyzedTablet, primaryTablet, reparentutil.IsReplicaSemiSync(durabilityPolicy, primaryTablet, analyzedTablet)) return true, topologyRecovery, err } diff --git a/go/vt/vtorc/logic/topology_recovery_dao.go b/go/vt/vtorc/logic/topology_recovery_dao.go index 13ecba5762c..20e23975499 100644 --- a/go/vt/vtorc/logic/topology_recovery_dao.go +++ b/go/vt/vtorc/logic/topology_recovery_dao.go @@ -32,8 +32,7 @@ import ( // AttemptFailureDetectionRegistration tries to add a 
failure-detection entry; if this fails that means the problem has already been detected func AttemptFailureDetectionRegistration(analysisEntry *inst.ReplicationAnalysis) (registrationSuccessful bool, err error) { args := sqlutils.Args( - analysisEntry.AnalyzedInstanceKey.Hostname, - analysisEntry.AnalyzedInstanceKey.Port, + analysisEntry.AnalyzedInstanceAlias, process.ThisHostname, util.ProcessToken.Hash, string(analysisEntry.Analysis), @@ -51,8 +50,7 @@ func AttemptFailureDetectionRegistration(analysisEntry *inst.ReplicationAnalysis query := fmt.Sprintf(` insert ignore into topology_failure_detection ( - hostname, - port, + alias, in_active_period, end_active_period_unixtime, processing_node_hostname, @@ -64,7 +62,6 @@ func AttemptFailureDetectionRegistration(analysisEntry *inst.ReplicationAnalysis is_actionable, start_active_period ) values ( - ?, ?, 1, 0, @@ -118,8 +115,7 @@ func writeTopologyRecovery(topologyRecovery *TopologyRecovery) (*TopologyRecover into topology_recovery ( recovery_id, uid, - hostname, - port, + alias, in_active_period, start_active_period, end_active_period_unixtime, @@ -134,7 +130,6 @@ func writeTopologyRecovery(topologyRecovery *TopologyRecovery) (*TopologyRecover ?, ?, ?, - ?, 1, NOW(), 0, @@ -144,18 +139,18 @@ func writeTopologyRecovery(topologyRecovery *TopologyRecovery) (*TopologyRecover ?, ?, ?, - (select ifnull(max(detection_id), 0) from topology_failure_detection where hostname=? and port=?) + (select ifnull(max(detection_id), 0) from topology_failure_detection where alias = ?) 
) `, sqlutils.NilIfZero(topologyRecovery.ID), topologyRecovery.UID, - analysisEntry.AnalyzedInstanceKey.Hostname, analysisEntry.AnalyzedInstanceKey.Port, + analysisEntry.AnalyzedInstanceAlias, process.ThisHostname, util.ProcessToken.Hash, string(analysisEntry.Analysis), analysisEntry.ClusterDetails.Keyspace, analysisEntry.ClusterDetails.Shard, analysisEntry.CountReplicas, - analysisEntry.AnalyzedInstanceKey.Hostname, analysisEntry.AnalyzedInstanceKey.Port, + analysisEntry.AnalyzedInstanceAlias, ) if err != nil { return nil, err @@ -180,14 +175,14 @@ func AttemptRecoveryRegistration(analysisEntry *inst.ReplicationAnalysis, failIf if failIfFailedInstanceInActiveRecovery { // Let's check if this instance has just been promoted recently and is still in active period. // If so, we reject recovery registration to avoid flapping. - recoveries, err := ReadInActivePeriodSuccessorInstanceRecovery(&analysisEntry.AnalyzedInstanceKey) + recoveries, err := ReadInActivePeriodSuccessorInstanceRecovery(analysisEntry.AnalyzedInstanceAlias) if err != nil { log.Error(err) return nil, err } if len(recoveries) > 0 { _ = RegisterBlockedRecoveries(analysisEntry, recoveries) - errMsg := fmt.Sprintf("AttemptRecoveryRegistration: instance %+v has recently been promoted (by failover of %+v) and is in active period. It will not be failed over. You may acknowledge the failure on %+v (-c ack-instance-recoveries) to remove this blockage", analysisEntry.AnalyzedInstanceKey, recoveries[0].AnalysisEntry.AnalyzedInstanceKey, recoveries[0].AnalysisEntry.AnalyzedInstanceKey) + errMsg := fmt.Sprintf("AttemptRecoveryRegistration: tablet %+v has recently been promoted (by failover of %+v) and is in active period. It will not be failed over. 
You may acknowledge the failure on %+v (-c ack-instance-recoveries) to remove this blockage", analysisEntry.AnalyzedInstanceAlias, recoveries[0].AnalysisEntry.AnalyzedInstanceAlias, recoveries[0].AnalysisEntry.AnalyzedInstanceAlias) log.Errorf(errMsg) return nil, fmt.Errorf(errMsg) } @@ -202,14 +197,14 @@ func AttemptRecoveryRegistration(analysisEntry *inst.ReplicationAnalysis, failIf } if len(recoveries) > 0 { _ = RegisterBlockedRecoveries(analysisEntry, recoveries) - errMsg := fmt.Sprintf("AttemptRecoveryRegistration: keyspace %+v shard %+v has recently experienced a failover (of %+v) and is in active period. It will not be failed over again. You may acknowledge the failure on this cluster (-c ack-cluster-recoveries) or on %+v (-c ack-instance-recoveries) to remove this blockage", analysisEntry.ClusterDetails.Keyspace, analysisEntry.ClusterDetails.Shard, recoveries[0].AnalysisEntry.AnalyzedInstanceKey, recoveries[0].AnalysisEntry.AnalyzedInstanceKey) + errMsg := fmt.Sprintf("AttemptRecoveryRegistration: keyspace %+v shard %+v has recently experienced a failover (of %+v) and is in active period. It will not be failed over again. You may acknowledge the failure on this cluster (-c ack-cluster-recoveries) or on %+v (-c ack-instance-recoveries) to remove this blockage", analysisEntry.ClusterDetails.Keyspace, analysisEntry.ClusterDetails.Shard, recoveries[0].AnalysisEntry.AnalyzedInstanceAlias, recoveries[0].AnalysisEntry.AnalyzedInstanceAlias) log.Errorf(errMsg) return nil, fmt.Errorf(errMsg) } } if !failIfFailedInstanceInActiveRecovery { // Implicitly acknowledge this instance's possibly existing active recovery, provided they are completed. 
- _, _ = AcknowledgeInstanceCompletedRecoveries(&analysisEntry.AnalyzedInstanceKey, "vtorc", fmt.Sprintf("implicit acknowledge due to user invocation of recovery on same instance: %+v", analysisEntry.AnalyzedInstanceKey)) + _, _ = AcknowledgeInstanceCompletedRecoveries(analysisEntry.AnalyzedInstanceAlias, "vtorc", fmt.Sprintf("implicit acknowledge due to user invocation of recovery on same instance: %+v", analysisEntry.AnalyzedInstanceAlias)) // The fact we only acknowledge a completed recovery solves the possible case of two DBAs simultaneously // trying to recover the same instance at the same time } @@ -250,8 +245,7 @@ func RegisterBlockedRecoveries(analysisEntry *inst.ReplicationAnalysis, blocking _, err := db.ExecVTOrc(` insert into blocked_topology_recovery ( - hostname, - port, + alias, keyspace, shard, analysis, @@ -262,7 +256,6 @@ func RegisterBlockedRecoveries(analysisEntry *inst.ReplicationAnalysis, blocking ?, ?, ?, - ?, NOW(), ? ) @@ -272,8 +265,7 @@ func RegisterBlockedRecoveries(analysisEntry *inst.ReplicationAnalysis, blocking analysis=values(analysis), last_blocked_timestamp=values(last_blocked_timestamp), blocking_recovery_id=values(blocking_recovery_id) - `, analysisEntry.AnalyzedInstanceKey.Hostname, - analysisEntry.AnalyzedInstanceKey.Port, + `, analysisEntry.AnalyzedInstanceAlias, analysisEntry.ClusterDetails.Keyspace, analysisEntry.ClusterDetails.Shard, string(analysisEntry.Analysis), @@ -294,30 +286,27 @@ func ExpireBlockedRecoveries() error { query := ` select - blocked_topology_recovery.hostname, - blocked_topology_recovery.port + blocked_topology_recovery.alias from blocked_topology_recovery left join topology_recovery on (blocking_recovery_id = topology_recovery.recovery_id and acknowledged = 0) where acknowledged is null ` - expiredKeys := inst.NewInstanceKeyMap() + var expiredAliases []string err := db.QueryVTOrc(query, sqlutils.Args(), func(m sqlutils.RowMap) error { - key := inst.InstanceKey{Hostname: m.GetString("hostname"), Port: 
m.GetInt("port")} - expiredKeys.AddKey(key) + expiredAliases = append(expiredAliases, m.GetString("alias")) return nil }) - for _, expiredKey := range expiredKeys.GetInstanceKeys() { + for _, expiredAlias := range expiredAliases { _, err := db.ExecVTOrc(` delete from blocked_topology_recovery where - hostname = ? - and port = ? + alias = ? `, - expiredKey.Hostname, expiredKey.Port, + expiredAlias, ) if err != nil { log.Error(err) @@ -382,13 +371,12 @@ func acknowledgeRecoveries(owner string, comment string, markEndRecovery bool, w // AcknowledgeInstanceCompletedRecoveries marks active and COMPLETED recoveries for given instane as acknowledged. // This also implied clearing their active period, which in turn enables further recoveries on those topologies -func AcknowledgeInstanceCompletedRecoveries(instanceKey *inst.InstanceKey, owner string, comment string) (countAcknowledgedEntries int64, err error) { +func AcknowledgeInstanceCompletedRecoveries(tabletAlias string, owner string, comment string) (countAcknowledgedEntries int64, err error) { whereClause := ` - hostname = ? - and port = ? + alias = ? and end_recovery is not null ` - return acknowledgeRecoveries(owner, comment, false, whereClause, sqlutils.Args(instanceKey.Hostname, instanceKey.Port)) + return acknowledgeRecoveries(owner, comment, false, whereClause, sqlutils.Args(tabletAlias)) } // AcknowledgeCrashedRecoveries marks recoveries whose processing nodes has crashed as acknowledged. @@ -406,25 +394,16 @@ func AcknowledgeCrashedRecoveries() (countAcknowledgedEntries int64, err error) // ResolveRecovery is called on completion of a recovery process and updates the recovery status. // It does not clear the "active period" as this still takes place in order to avoid flapping. 
func writeResolveRecovery(topologyRecovery *TopologyRecovery) error { - var successorKeyToWrite inst.InstanceKey - if topologyRecovery.IsSuccessful { - successorKeyToWrite = *topologyRecovery.SuccessorKey - } _, err := db.ExecVTOrc(` update topology_recovery set is_successful = ?, - successor_hostname = ?, - successor_port = ?, successor_alias = ?, - lost_replicas = ?, - participating_instances = ?, all_errors = ?, end_recovery = NOW() where uid = ? - `, topologyRecovery.IsSuccessful, successorKeyToWrite.Hostname, successorKeyToWrite.Port, - topologyRecovery.SuccessorAlias, topologyRecovery.LostReplicas.ToCommaDelimitedList(), - topologyRecovery.ParticipatingInstanceKeys.ToCommaDelimitedList(), + `, topologyRecovery.IsSuccessful, + topologyRecovery.SuccessorAlias, strings.Join(topologyRecovery.AllErrors, "\n"), topologyRecovery.UID, ) @@ -439,32 +418,27 @@ func readRecoveries(whereCondition string, limit string, args []any) ([]*Topolog res := []*TopologyRecovery{} query := fmt.Sprintf(` select - recovery_id, - uid, - hostname, - port, - (IFNULL(end_active_period_unixtime, 0) = 0) as is_active, - start_active_period, - IFNULL(end_active_period_unixtime, 0) as end_active_period_unixtime, - IFNULL(end_recovery, '') AS end_recovery, - is_successful, - processing_node_hostname, - processcing_node_token, - ifnull(successor_hostname, '') as successor_hostname, - ifnull(successor_port, 0) as successor_port, - ifnull(successor_alias, '') as successor_alias, - analysis, - keyspace, - shard, - count_affected_replicas, - participating_instances, - lost_replicas, - all_errors, - acknowledged, - acknowledged_at, - acknowledged_by, - acknowledge_comment, - last_detection_id + recovery_id, + uid, + alias, + (IFNULL(end_active_period_unixtime, 0) = 0) as is_active, + start_active_period, + IFNULL(end_active_period_unixtime, 0) as end_active_period_unixtime, + IFNULL(end_recovery, '') AS end_recovery, + is_successful, + processing_node_hostname, + processcing_node_token, + 
ifnull(successor_alias, '') as successor_alias, + analysis, + keyspace, + shard, + count_affected_replicas, + all_errors, + acknowledged, + acknowledged_at, + acknowledged_by, + acknowledge_comment, + last_detection_id from topology_recovery %s @@ -484,23 +458,17 @@ func readRecoveries(whereCondition string, limit string, args []any) ([]*Topolog topologyRecovery.ProcessingNodeHostname = m.GetString("processing_node_hostname") topologyRecovery.ProcessingNodeToken = m.GetString("processcing_node_token") - topologyRecovery.AnalysisEntry.AnalyzedInstanceKey.Hostname = m.GetString("hostname") - topologyRecovery.AnalysisEntry.AnalyzedInstanceKey.Port = m.GetInt("port") + topologyRecovery.AnalysisEntry.AnalyzedInstanceAlias = m.GetString("alias") topologyRecovery.AnalysisEntry.Analysis = inst.AnalysisCode(m.GetString("analysis")) topologyRecovery.AnalysisEntry.ClusterDetails.Keyspace = m.GetString("keyspace") topologyRecovery.AnalysisEntry.ClusterDetails.Shard = m.GetString("shard") topologyRecovery.AnalysisEntry.CountReplicas = m.GetUint("count_affected_replicas") - topologyRecovery.SuccessorKey = &inst.InstanceKey{} - topologyRecovery.SuccessorKey.Hostname = m.GetString("successor_hostname") - topologyRecovery.SuccessorKey.Port = m.GetInt("successor_port") topologyRecovery.SuccessorAlias = m.GetString("successor_alias") topologyRecovery.AnalysisEntry.ClusterDetails.ReadRecoveryInfo() topologyRecovery.AllErrors = strings.Split(m.GetString("all_errors"), "\n") - _ = topologyRecovery.LostReplicas.ReadCommaDelimitedList(m.GetString("lost_replicas")) - _ = topologyRecovery.ParticipatingInstanceKeys.ReadCommaDelimitedList(m.GetString("participating_instances")) topologyRecovery.Acknowledged = m.GetBool("acknowledged") topologyRecovery.AcknowledgedAt = m.GetString("acknowledged_at") @@ -533,13 +501,13 @@ func ReadInActivePeriodClusterRecovery(keyspace string, shard, analysis string) // ReadInActivePeriodSuccessorInstanceRecovery reads completed recoveries for a given instance, 
where said instance // was promoted as result, still in active period (may be used to block further recoveries should this instance die) -func ReadInActivePeriodSuccessorInstanceRecovery(instanceKey *inst.InstanceKey) ([]*TopologyRecovery, error) { +func ReadInActivePeriodSuccessorInstanceRecovery(tabletAlias string) ([]*TopologyRecovery, error) { whereClause := ` where in_active_period=1 and - successor_hostname=? and successor_port=?` - return readRecoveries(whereClause, ``, sqlutils.Args(instanceKey.Hostname, instanceKey.Port)) + successor_alias=?` + return readRecoveries(whereClause, ``, sqlutils.Args(tabletAlias)) } // ReadRecentRecoveries reads latest recovery entries from topology_recovery diff --git a/go/vt/vtorc/logic/topology_recovery_dao_test.go b/go/vt/vtorc/logic/topology_recovery_dao_test.go index f01e16560a8..f9a9026a4a1 100644 --- a/go/vt/vtorc/logic/topology_recovery_dao_test.go +++ b/go/vt/vtorc/logic/topology_recovery_dao_test.go @@ -22,7 +22,6 @@ import ( "github.com/stretchr/testify/require" "vitess.io/vitess/go/vt/external/golib/sqlutils" - "vitess.io/vitess/go/vt/vtorc/db" "vitess.io/vitess/go/vt/vtorc/inst" ) @@ -39,11 +38,8 @@ func TestTopologyRecovery(t *testing.T) { }() replicationAnalysis := inst.ReplicationAnalysis{ - AnalyzedInstanceKey: inst.InstanceKey{ - Hostname: hostname, - Port: 101, - }, - TabletType: tab101.Type, + AnalyzedInstanceAlias: "zone1-0000000101", + TabletType: tab101.Type, ClusterDetails: inst.ClusterInfo{ Keyspace: keyspace, Shard: shard, @@ -81,10 +77,7 @@ func TestBlockedRecoveryInsertion(t *testing.T) { }() analysisEntry := &inst.ReplicationAnalysis{ - AnalyzedInstanceKey: inst.InstanceKey{ - Hostname: "localhost", - Port: 100, - }, + AnalyzedInstanceAlias: "zone1-0000000100", ClusterDetails: inst.ClusterInfo{ Keyspace: "ks", Shard: "0", diff --git a/go/vt/vtorc/logic/topology_recovery_test.go b/go/vt/vtorc/logic/topology_recovery_test.go index 2945a796fcc..73fa3929eec 100644 --- 
a/go/vt/vtorc/logic/topology_recovery_test.go +++ b/go/vt/vtorc/logic/topology_recovery_test.go @@ -26,6 +26,7 @@ import ( topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/topo/memorytopo" + "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/vtorc/config" "vitess.io/vitess/go/vt/vtorc/db" "vitess.io/vitess/go/vt/vtorc/inst" @@ -117,13 +118,10 @@ func TestElectNewPrimaryPanic(t *testing.T) { err = inst.SaveTablet(tablet) require.NoError(t, err) analysisEntry := inst.ReplicationAnalysis{ - AnalyzedInstanceKey: inst.InstanceKey{ - Hostname: tablet.MysqlHostname, - Port: int(tablet.MysqlPort), - }, + AnalyzedInstanceAlias: topoproto.TabletAliasString(tablet.Alias), } ts = memorytopo.NewServer("zone1") - recoveryAttempted, _, err := electNewPrimary(context.Background(), analysisEntry, nil, false, false) + recoveryAttempted, _, err := electNewPrimary(context.Background(), analysisEntry) require.True(t, recoveryAttempted) require.Error(t, err) } @@ -167,18 +165,12 @@ func TestDifferentAnalysescHaveDifferentCooldowns(t *testing.T) { err = inst.SaveTablet(replica) require.NoError(t, err) primaryAnalysisEntry := inst.ReplicationAnalysis{ - AnalyzedInstanceKey: inst.InstanceKey{ - Hostname: primary.MysqlHostname, - Port: int(primary.MysqlPort), - }, - Analysis: inst.ReplicationStopped, + AnalyzedInstanceAlias: topoproto.TabletAliasString(primary.Alias), + Analysis: inst.ReplicationStopped, } replicaAnalysisEntry := inst.ReplicationAnalysis{ - AnalyzedInstanceKey: inst.InstanceKey{ - Hostname: replica.MysqlHostname, - Port: int(replica.MysqlPort), - }, - Analysis: inst.DeadPrimary, + AnalyzedInstanceAlias: topoproto.TabletAliasString(replica.Alias), + Analysis: inst.DeadPrimary, } ts = memorytopo.NewServer("zone1") _, err = AttemptRecoveryRegistration(&replicaAnalysisEntry, false, true) @@ -195,26 +187,17 @@ func TestGetCheckAndRecoverFunctionCode(t *testing.T) { name string ersEnabled bool analysisCode inst.AnalysisCode - 
analyzedInstanceKey *inst.InstanceKey wantRecoveryFunction recoveryFunction }{ { - name: "DeadPrimary with ERS enabled", - ersEnabled: true, - analysisCode: inst.DeadPrimary, - analyzedInstanceKey: &inst.InstanceKey{ - Hostname: hostname, - Port: 1, - }, + name: "DeadPrimary with ERS enabled", + ersEnabled: true, + analysisCode: inst.DeadPrimary, wantRecoveryFunction: recoverDeadPrimaryFunc, }, { - name: "DeadPrimary with ERS disabled", - ersEnabled: false, - analysisCode: inst.DeadPrimary, - analyzedInstanceKey: &inst.InstanceKey{ - Hostname: hostname, - Port: 1, - }, + name: "DeadPrimary with ERS disabled", + ersEnabled: false, + analysisCode: inst.DeadPrimary, wantRecoveryFunction: noRecoveryFunc, }, { name: "PrimaryHasPrimary", @@ -251,7 +234,7 @@ func TestGetCheckAndRecoverFunctionCode(t *testing.T) { config.SetERSEnabled(tt.ersEnabled) defer config.SetERSEnabled(prevVal) - gotFunc := getCheckAndRecoverFunctionCode(tt.analysisCode, tt.analyzedInstanceKey) + gotFunc := getCheckAndRecoverFunctionCode(tt.analysisCode, "") require.EqualValues(t, tt.wantRecoveryFunction, gotFunc) }) } diff --git a/go/vt/vtorc/logic/orchestrator.go b/go/vt/vtorc/logic/vtorc.go similarity index 82% rename from go/vt/vtorc/logic/orchestrator.go rename to go/vt/vtorc/logic/vtorc.go index c2ce4eef179..90ac9140ece 100644 --- a/go/vt/vtorc/logic/orchestrator.go +++ b/go/vt/vtorc/logic/vtorc.go @@ -47,7 +47,7 @@ const ( // that were requested for discovery. It can be continuously updated // as discovery process progresses. 
var discoveryQueue *discovery.Queue -var snapshotDiscoveryKeys chan inst.InstanceKey +var snapshotDiscoveryKeys chan string var snapshotDiscoveryKeysMutex sync.Mutex var hasReceivedSIGTERM int32 @@ -65,7 +65,7 @@ var isElectedNode int64 var recentDiscoveryOperationKeys *cache.Cache func init() { - snapshotDiscoveryKeys = make(chan inst.InstanceKey, 10) + snapshotDiscoveryKeys = make(chan string, 10) _ = metrics.Register("discoveries.attempt", discoveriesCounter) _ = metrics.Register("discoveries.fail", failedDiscoveriesCounter) @@ -113,7 +113,7 @@ func acceptSighupSignal() { go func() { for range c { log.Infof("Received SIGHUP. Reloading configuration") - _ = inst.AuditOperation("reload-configuration", nil, "Triggered via SIGHUP") + _ = inst.AuditOperation("reload-configuration", "", "Triggered via SIGHUP") config.Reload() discoveryMetrics.SetExpirePeriod(time.Duration(config.DiscoveryCollectionRetentionSeconds) * time.Second) } @@ -126,7 +126,7 @@ func closeVTOrc() { atomic.StoreInt32(&hasReceivedSIGTERM, 1) discoveryMetrics.StopAutoExpiration() // Poke other go routines to stop cleanly here ... - _ = inst.AuditOperation("shutdown", nil, "Triggered via SIGTERM") + _ = inst.AuditOperation("shutdown", "", "Triggered via SIGTERM") // wait for the locks to be released waitForLocksRelease() log.Infof("VTOrc closed") @@ -159,18 +159,18 @@ func handleDiscoveryRequests() { for i := uint(0); i < config.DiscoveryMaxConcurrency; i++ { go func() { for { - instanceKey := discoveryQueue.Consume() + tabletAlias := discoveryQueue.Consume() // Possibly this used to be the elected node, but has // been demoted, while still the queue is full. if !IsLeaderOrActive() { log.Infof("Node apparently demoted. Skipping discovery of %+v. 
"+ - "Remaining queue size: %+v", instanceKey, discoveryQueue.QueueLen()) - discoveryQueue.Release(instanceKey) + "Remaining queue size: %+v", tabletAlias, discoveryQueue.QueueLen()) + discoveryQueue.Release(tabletAlias) continue } - DiscoverInstance(instanceKey, false /* forceDiscovery */) - discoveryQueue.Release(instanceKey) + DiscoverInstance(tabletAlias, false /* forceDiscovery */) + discoveryQueue.Release(tabletAlias) } }() } @@ -179,9 +179,9 @@ func handleDiscoveryRequests() { // DiscoverInstance will attempt to discover (poll) an instance (unless // it is already up-to-date) and will also ensure that its primary and // replicas (if any) are also checked. -func DiscoverInstance(instanceKey inst.InstanceKey, forceDiscovery bool) { - if inst.InstanceIsForgotten(&instanceKey) { - log.Infof("discoverInstance: skipping discovery of %+v because it is set to be forgotten", instanceKey) +func DiscoverInstance(tabletAlias string, forceDiscovery bool) { + if inst.InstanceIsForgotten(tabletAlias) { + log.Infof("discoverInstance: skipping discovery of %+v because it is set to be forgotten", tabletAlias) return } @@ -198,28 +198,27 @@ func DiscoverInstance(instanceKey inst.InstanceKey, forceDiscovery bool) { discoveryTime := latency.Elapsed("total") if discoveryTime > instancePollSecondsDuration() { instancePollSecondsExceededCounter.Inc(1) - log.Warningf("discoverInstance exceeded InstancePollSeconds for %+v, took %.4fs", instanceKey, discoveryTime.Seconds()) + log.Warningf("discoverInstance exceeded InstancePollSeconds for %+v, took %.4fs", tabletAlias, discoveryTime.Seconds()) if metric != nil { metric.InstancePollSecondsDurationCount = 1 } } }() - _, _ = instanceKey.ResolveHostname() - if !instanceKey.IsValid() { + if tabletAlias == "" { return } // Calculate the expiry period each time as InstancePollSeconds // _may_ change during the run of the process (via SIGHUP) and // it is not possible to change the cache's default expiry.. 
- if existsInCacheError := recentDiscoveryOperationKeys.Add(instanceKey.DisplayString(), true, instancePollSecondsDuration()); existsInCacheError != nil && !forceDiscovery { + if existsInCacheError := recentDiscoveryOperationKeys.Add(tabletAlias, true, instancePollSecondsDuration()); existsInCacheError != nil && !forceDiscovery { // Just recently attempted return } latency.Start("backend") - instance, found, _ := inst.ReadInstance(&instanceKey) + instance, found, _ := inst.ReadInstance(tabletAlias) latency.Stop("backend") if !forceDiscovery && found && instance.IsUpToDate && instance.IsLastCheckValid { // we've already discovered this one. Skip! @@ -229,7 +228,7 @@ func DiscoverInstance(instanceKey inst.InstanceKey, forceDiscovery bool) { discoveriesCounter.Inc(1) // First we've ever heard of this instance. Continue investigation: - instance, err := inst.ReadTopologyInstanceBufferable(&instanceKey, latency) + instance, err := inst.ReadTopologyInstanceBufferable(tabletAlias, latency) // panic can occur (IO stuff). Therefore it may happen // that instance is nil. Check it, but first get the timing metrics. 
totalLatency := latency.Elapsed("total") @@ -244,16 +243,16 @@ func DiscoverInstance(instanceKey inst.InstanceKey, forceDiscovery bool) { failedDiscoveriesCounter.Inc(1) metric = &discovery.Metric{ Timestamp: time.Now(), - InstanceKey: instanceKey, + TabletAlias: tabletAlias, TotalLatency: totalLatency, BackendLatency: backendLatency, InstanceLatency: instanceLatency, Err: err, } _ = discoveryMetrics.Append(metric) - if util.ClearToLog("discoverInstance", instanceKey.StringCode()) { + if util.ClearToLog("discoverInstance", tabletAlias) { log.Warningf(" DiscoverInstance(%+v) instance is nil in %.3fs (Backend: %.3fs, Instance: %.3fs), error=%+v", - instanceKey, + tabletAlias, totalLatency.Seconds(), backendLatency.Seconds(), instanceLatency.Seconds(), @@ -264,7 +263,7 @@ func DiscoverInstance(instanceKey inst.InstanceKey, forceDiscovery bool) { metric = &discovery.Metric{ Timestamp: time.Now(), - InstanceKey: instanceKey, + TabletAlias: tabletAlias, TotalLatency: totalLatency, BackendLatency: backendLatency, InstanceLatency: instanceLatency, @@ -297,7 +296,7 @@ func onHealthTick() { if !IsLeaderOrActive() { return } - instanceKeys, err := inst.ReadOutdatedInstanceKeys() + tabletAliases, err := inst.ReadOutdatedInstanceKeys() if err != nil { log.Error(err) } @@ -307,9 +306,6 @@ func onHealthTick() { go func() { _, _ = process.RegisterNode(process.ThisNodeHealth) }() - go func() { - _ = inst.ExpireMaintenance() - }() } func() { @@ -320,14 +316,14 @@ func onHealthTick() { countSnapshotKeys := len(snapshotDiscoveryKeys) for i := 0; i < countSnapshotKeys; i++ { - instanceKeys = append(instanceKeys, <-snapshotDiscoveryKeys) + tabletAliases = append(tabletAliases, <-snapshotDiscoveryKeys) } }() // avoid any logging unless there's something to be done - if len(instanceKeys) > 0 { - for _, instanceKey := range instanceKeys { - if instanceKey.IsValid() { - discoveryQueue.Push(instanceKey) + if len(tabletAliases) > 0 { + for _, tabletAlias := range tabletAliases { + if 
tabletAlias != "" { + discoveryQueue.Push(tabletAlias) } } } @@ -343,11 +339,9 @@ func ContinuousDiscovery() { checkAndRecoverWaitPeriod := 3 * instancePollSecondsDuration() recentDiscoveryOperationKeys = cache.New(instancePollSecondsDuration(), time.Second) - _ = inst.LoadHostnameResolveCache() go handleDiscoveryRequests() healthTick := time.Tick(config.HealthPollSeconds * time.Second) - instancePollTick := time.Tick(instancePollSecondsDuration()) caretakingTick := time.Tick(time.Minute) recoveryTick := time.Tick(time.Duration(config.Config.RecoveryPollSeconds) * time.Second) tabletTopoTick := OpenTabletDiscovery() @@ -375,39 +369,19 @@ func ContinuousDiscovery() { go func() { onHealthTick() }() - case <-instancePollTick: - go func() { - // This tick does NOT do instance poll (these are handled by the oversampling discoveryTick) - // But rather should invoke such routinely operations that need to be as (or roughly as) frequent - // as instance poll - if IsLeaderOrActive() { - go inst.ExpireDowntime() - } - }() case <-caretakingTick: // Various periodic internal maintenance tasks go func() { if IsLeaderOrActive() { go inst.ForgetLongUnseenInstances() - go inst.ForgetUnseenInstancesDifferentlyResolved() - go inst.ForgetExpiredHostnameResolves() - go inst.DeleteInvalidHostnameResolves() - go inst.ResolveUnknownPrimaryHostnameResolves() - go inst.ExpireMaintenance() - go inst.ExpireCandidateInstances() - go inst.ExpireHostnameUnresolve() go inst.ExpireAudit() - go inst.FlushNontrivialResolveCacheToDatabase() go inst.ExpireStaleInstanceBinlogCoordinates() go process.ExpireNodesHistory() go process.ExpireAvailableNodes() go ExpireFailureDetectionHistory() go ExpireTopologyRecoveryHistory() go ExpireTopologyRecoveryStepsHistory() - } else { - // Take this opportunity to refresh yourself - go inst.LoadHostnameResolveCache() } }() case <-recoveryTick: @@ -427,7 +401,7 @@ func ContinuousDiscovery() { return } if runCheckAndRecoverOperationsTimeRipe() { - 
CheckAndRecover(nil, nil, false) + CheckAndRecover() } else { log.Infof("Waiting for %+v seconds to pass before running failure detection/recovery", checkAndRecoverWaitPeriod.Seconds()) } diff --git a/go/vt/vtorc/logic/orchestrator_test.go b/go/vt/vtorc/logic/vtorc_test.go similarity index 100% rename from go/vt/vtorc/logic/orchestrator_test.go rename to go/vt/vtorc/logic/vtorc_test.go diff --git a/go/vt/vtorc/server/discovery.go b/go/vt/vtorc/server/discovery.go index 1f0011cfabd..2ef81eea3c4 100644 --- a/go/vt/vtorc/server/discovery.go +++ b/go/vt/vtorc/server/discovery.go @@ -20,7 +20,6 @@ import ( "github.com/spf13/pflag" "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/vtorc/inst" "vitess.io/vitess/go/vt/vtorc/logic" "vitess.io/vitess/go/vt/vtorc/process" ) @@ -34,7 +33,6 @@ func RegisterFlags(fs *pflag.FlagSet) { // StartVTOrcDiscovery starts VTOrc discovery serving func StartVTOrcDiscovery() { process.ContinuousRegistration(string(process.VTOrcExecutionHTTPMode), "") - inst.SetMaintenanceOwner(process.ThisHostname) log.Info("Starting Discovery") go logic.ContinuousDiscovery() diff --git a/go/vt/vtorc/test/recovery_analysis.go b/go/vt/vtorc/test/recovery_analysis.go index 7585fa17441..6b90e9aa9f7 100644 --- a/go/vt/vtorc/test/recovery_analysis.go +++ b/go/vt/vtorc/test/recovery_analysis.go @@ -40,8 +40,6 @@ type InfoForRecoveryAnalysis struct { IsCoPrimary int Hostname string Port int - SourceHost string - SourcePort int DataCenter string Region string PhysicalEnvironment string @@ -148,8 +146,6 @@ func (info *InfoForRecoveryAnalysis) ConvertToRowMap() sqlutils.RowMap { rowMap["semi_sync_primary_status"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.SemiSyncPrimaryStatus), Valid: true} rowMap["semi_sync_primary_wait_for_replica_count"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.SemiSyncPrimaryWaitForReplicaCount), Valid: true} rowMap["semi_sync_replica_enabled"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.SemiSyncReplicaEnabled), 
Valid: true} - rowMap["source_host"] = sqlutils.CellData{String: info.SourceHost, Valid: true} - rowMap["source_port"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.SourcePort), Valid: true} res, _ := prototext.Marshal(info.TabletInfo) rowMap["tablet_info"] = sqlutils.CellData{String: string(res), Valid: true} return rowMap