Skip to content
This repository has been archived by the owner on Sep 30, 2024. It is now read-only.

Supporting UnreachableIntermediateMasterWithLaggingReplicas #1005

Merged
merged 6 commits into from
Nov 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/failure-detection.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ Observe the following list of potential failures:
* DeadIntermediateMasterAndSlaves
* AllIntermediateMasterSlavesFailingToConnectOrDead
* AllIntermediateMasterSlavesNotReplicating
* UnreachableIntermediateMasterWithLaggingReplicas
* UnreachableIntermediateMaster
* BinlogServerFailingToConnectToMaster

Expand Down
1 change: 1 addition & 0 deletions go/inst/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ const (
DeadIntermediateMasterWithSingleSlaveFailingToConnect = "DeadIntermediateMasterWithSingleSlaveFailingToConnect"
DeadIntermediateMasterAndSomeSlaves = "DeadIntermediateMasterAndSomeSlaves"
DeadIntermediateMasterAndSlaves = "DeadIntermediateMasterAndSlaves"
UnreachableIntermediateMasterWithLaggingReplicas = "UnreachableIntermediateMasterWithLaggingReplicas"
UnreachableIntermediateMaster = "UnreachableIntermediateMaster"
AllIntermediateMasterSlavesFailingToConnectOrDead = "AllIntermediateMasterSlavesFailingToConnectOrDead"
AllIntermediateMasterSlavesNotReplicating = "AllIntermediateMasterSlavesNotReplicating"
Expand Down
4 changes: 4 additions & 0 deletions go/inst/analysis_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,10 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
a.Analysis = DeadIntermediateMasterAndSlaves
a.Description = "Intermediate master cannot be reached by orchestrator and all of its replicas are unreachable"
//
} else if !a.IsMaster && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0 {
a.Analysis = UnreachableIntermediateMasterWithLaggingReplicas
a.Description = "Intermediate master cannot be reached by orchestrator and all of its replicas are lagging"
//
} else if !a.IsMaster && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 {
a.Analysis = UnreachableIntermediateMaster
a.Description = "Intermediate master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue"
Expand Down
4 changes: 4 additions & 0 deletions go/logic/topology_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -1506,6 +1506,8 @@ func getCheckAndRecoverFunction(analysisCode inst.AnalysisCode, analyzedInstance
return checkAndRecoverGenericProblem, false
case inst.AllMasterSlavesNotReplicatingOrDead:
return checkAndRecoverGenericProblem, false
case inst.UnreachableIntermediateMasterWithLaggingReplicas:
return checkAndRecoverGenericProblem, false
}
// Right now this is mostly causing noise with no clear action.
// Will revisit this in the future.
Expand All @@ -1524,6 +1526,8 @@ func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis) {
go emergentlyReadTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis)
case inst.UnreachableMasterWithLaggingReplicas:
go emergentlyRestartReplicationOnTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis)
case inst.UnreachableIntermediateMasterWithLaggingReplicas:
go emergentlyRestartReplicationOnTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis)
case inst.AllMasterSlavesNotReplicating:
go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis)
case inst.AllMasterSlavesNotReplicatingOrDead:
Expand Down
1 change: 1 addition & 0 deletions resources/public/js/cluster-analysis-shared.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ var interestingAnalysis = {
"DeadIntermediateMasterAndSlaves" : true,
"AllIntermediateMasterSlavesFailingToConnectOrDead" : true,
"AllIntermediateMasterSlavesNotReplicating" : true,
"UnreachableIntermediateMasterWithLaggingReplicas": true,
"UnreachableIntermediateMaster" : true,
"BinlogServerFailingToConnectToMaster" : true,
};
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- 22295 replicates from 22294
-- Move last_seen to one minute before last_checked for the instance on port 22294.
-- NOTE(review): presumably this makes the instance's last check count as invalid
-- (i.e. unreachable) in the analysis -- confirm against the analysis query.
UPDATE database_instance SET last_seen=last_checked - interval 1 minute where port=22294;

-- Set slave_lag_seconds to 60 on every row whose master_port is 22294.
UPDATE database_instance SET slave_lag_seconds=60 where master_port=22294;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
testhost:22294 (cluster testhost:22293): UnreachableIntermediateMasterWithLaggingReplicas
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-c replication-analysis