etcdserver: renaming db happens before snapshot persists to wal and s…

…nap files In the case that follower recieves a snapshot from leader and crashes before renaming xxx.snap.db to db but after snapshot has persisted to .wal and .snap, restarting follower results loading old db, new .wal, and new .snap. This will causes a index mismatch between snap metadata index and consistent index from db. This pr forces an ordering where saving/renaming db must happen before snapshot is persisted to wal and snap file. this ensures that db file can never be newer than wal and snap file. hence, it guarantees the invariant snapshot.Metadata.Index <= db.ConsistentIndex() in NewServer() when checking validity of db and snap file. FIXES etcd-io#7628
fanminshi · May 5, 2017 · ff112fe · ff112fe
1 parent 505bf8c
commit ff112fe
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 8 deletions.
diff --git a/etcdserver/raft.go b/etcdserver/raft.go
@@ -83,7 +83,8 @@ type RaftTimer interface {
 type apply struct {
 	entries  []raftpb.Entry
 	snapshot raftpb.Snapshot
-	raftDone <-chan struct{} // rx {} after raft has persisted messages
+	// notifyc synchronizes etcd server applies with the raft node
+	notifyc chan struct{}
 }
 
 type raftNode struct {
@@ -190,11 +191,11 @@ func (r *raftNode) start(rh *raftReadyHandler) {
 					}
 				}
 
-				raftDone := make(chan struct{}, 1)
+				notifyc := make(chan struct{}, 1)
 				ap := apply{
 					entries:  rd.CommittedEntries,
 					snapshot: rd.Snapshot,
-					raftDone: raftDone,
+					notifyc:  notifyc,
 				}
 
 				updateCommittedIndex(&ap, rh)
@@ -223,6 +224,8 @@ func (r *raftNode) start(rh *raftReadyHandler) {
 				// gofail: var raftAfterSave struct{}
 
 				if !raft.IsEmptySnap(rd.Snapshot) {
+					// wait for snapshot db to be renamed to in-use db
+					<-notifyc
 					// gofail: var raftBeforeSaveSnap struct{}
 					if err := r.storage.SaveSnap(rd.Snapshot); err != nil {
 						plog.Fatalf("raft save snapshot error: %v", err)
@@ -240,7 +243,7 @@ func (r *raftNode) start(rh *raftReadyHandler) {
 					msgs := r.processMessages(rd.Messages)
 
 					// now unblocks 'applyAll' that waits on Raft log disk writes before triggering snapshots
-					raftDone <- struct{}{}
+					notifyc <- struct{}{}
 
 					// Candidate or follower needs to wait for all pending configuration
 					// changes to be applied before sending messages.
@@ -259,9 +262,9 @@ func (r *raftNode) start(rh *raftReadyHandler) {
 					if waitApply {
 						// blocks until 'applyAll' calls 'applyWait.Trigger'
 						// to be in sync with scheduled config-change job
-						// (assume raftDone has cap of 1)
+						// (assume notifyc has cap of 1)
 						select {
-						case raftDone <- struct{}{}:
+						case notifyc <- struct{}{}:
 						case <-r.stopped:
 							return
 						}
@@ -271,7 +274,7 @@ func (r *raftNode) start(rh *raftReadyHandler) {
 					r.transport.Send(msgs)
 				} else {
 					// leader already processed 'MsgSnap' and signaled
-					raftDone <- struct{}{}
+					notifyc <- struct{}{}
 				}
 
 				r.Advance()

diff --git a/etcdserver/server.go b/etcdserver/server.go
@@ -778,7 +778,7 @@ func (s *EtcdServer) applyAll(ep *etcdProgress, apply *apply) {
 	// wait for the raft routine to finish the disk writes before triggering a
 	// snapshot. or applied index might be greater than the last index in raft
 	// storage, since the raft routine might be slower than apply routine.
-	<-apply.raftDone
+	<-apply.notifyc
 
 	s.triggerSnapshot(ep)
 	select {
@@ -812,6 +812,8 @@ func (s *EtcdServer) applySnapshot(ep *etcdProgress, apply *apply) {
 	if err := os.Rename(snapfn, fn); err != nil {
 		plog.Panicf("rename snapshot file error: %v", err)
 	}
+	// raft can now claim the snapshot has been applied
+	apply.notifyc <- struct{}{}
 
 	newbe := newBackend(fn, s.Cfg.QuotaBackendBytes)