-
Notifications
You must be signed in to change notification settings - Fork 5.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
store/tikv: avoid holding write lock for long time #6880
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -449,33 +449,38 @@ func (c *RegionCache) loadStoreAddr(bo *Backoffer, id uint64) (string, error) { | |
} | ||
} | ||
|
||
// OnRequestFail is used for clearing cache when a tikv server does not respond. | ||
func (c *RegionCache) OnRequestFail(ctx *RPCContext, err error) { | ||
// Switch region's leader peer to next one. | ||
regionID := ctx.Region | ||
// DropStoreOnSendRequestFail is used for clearing cache when a tikv server does not respond. | ||
func (c *RegionCache) DropStoreOnSendRequestFail(ctx *RPCContext, err error) { | ||
// We need to drop the store only when the request is the first one failed on this store. | ||
// Because too many concurrent requests trying to drop the store will be blocked on the lock. | ||
failedRegionID := ctx.Region | ||
failedStoreID := ctx.Peer.StoreId | ||
c.mu.Lock() | ||
if cachedregion, ok := c.mu.regions[regionID]; ok { | ||
region := cachedregion.region | ||
if !region.OnRequestFail(ctx.Peer.GetStoreId()) { | ||
c.dropRegionFromCache(regionID) | ||
} | ||
_, ok := c.mu.regions[failedRegionID] | ||
if !ok { | ||
// The failed region is dropped already by another request, we don't need to iterate the regions | ||
// and find regions on the failed store to drop. | ||
c.mu.Unlock() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not use defer to unlock? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So we can do something out of lock at the end of this function. |
||
return | ||
} | ||
c.mu.Unlock() | ||
// Store's meta may be out of date. | ||
storeID := ctx.Peer.GetStoreId() | ||
c.storeMu.Lock() | ||
delete(c.storeMu.stores, storeID) | ||
c.storeMu.Unlock() | ||
|
||
log.Infof("drop regions of store %d from cache due to request fail, err: %v", storeID, err) | ||
|
||
c.mu.Lock() | ||
for id, r := range c.mu.regions { | ||
if r.region.peer.GetStoreId() == storeID { | ||
if r.region.peer.GetStoreId() == failedStoreID { | ||
c.dropRegionFromCache(id) | ||
} | ||
} | ||
c.mu.Unlock() | ||
|
||
// Store's meta may be out of date. | ||
var failedStoreAddr string | ||
c.storeMu.Lock() | ||
store, ok := c.storeMu.stores[failedStoreID] | ||
if ok { | ||
failedStoreAddr = store.Addr | ||
delete(c.storeMu.stores, failedStoreID) | ||
} | ||
c.storeMu.Unlock() | ||
log.Infof("drop regions that on the store %d(%s) due to send request fail, err: %v", | ||
failedStoreID, failedStoreAddr, err) | ||
} | ||
|
||
// OnRegionStale removes the old region and inserts new regions into the cache. | ||
|
@@ -531,9 +536,8 @@ func (item *btreeItem) Less(other btree.Item) bool { | |
|
||
// Region stores region's meta and its leader peer. | ||
type Region struct { | ||
meta *metapb.Region | ||
peer *metapb.Peer | ||
unreachableStores []uint64 | ||
meta *metapb.Region | ||
peer *metapb.Peer | ||
} | ||
|
||
// GetID returns id. | ||
|
@@ -581,26 +585,6 @@ func (r *Region) GetContext() *kvrpcpb.Context { | |
} | ||
} | ||
|
||
// OnRequestFail records unreachable peer and tries to select another valid peer. | ||
// It returns false if all peers are unreachable. | ||
func (r *Region) OnRequestFail(storeID uint64) bool { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why remove this? Is this logic useless or moved to another place? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's useless. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are some considerations for using unreachable store list. Consider a store is down, another peer of the region becomes the leader, but somehow the new leader is not able to send heartbeat to PD in time. With the unreachable store list, tidb can try the other peers automatically. Otherwise, it will continue to reconnect the down tikv until timeout. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @disksing |
||
if r.peer.GetStoreId() != storeID { | ||
return true | ||
} | ||
r.unreachableStores = append(r.unreachableStores, storeID) | ||
L: | ||
for _, p := range r.meta.Peers { | ||
for _, id := range r.unreachableStores { | ||
if p.GetStoreId() == id { | ||
continue L | ||
} | ||
} | ||
r.peer = p | ||
return true | ||
} | ||
return false | ||
} | ||
|
||
// SwitchPeer switches current peer to the one on specific store. It returns | ||
// false if no peer matches the storeID. | ||
func (r *Region) SwitchPeer(storeID uint64) bool { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this the ID of a peer or store?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
peer, it was mistakenly named leaderStoreID.