From 4eb1da09586191a9e6ca0d320471c94cb30896d5 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Wed, 29 May 2024 12:52:52 -0400 Subject: [PATCH] remove deadlock in GetOrHandshake (#1151) We had a rare deadlock in GetOrHandshake because we kept the hostmap lock when we do the call to StartHandshake. StartHandshake can block while sending to the lighthouse query worker channel, and that worker needs to be able to grab the hostmap lock to do its work. Other calls for StartHandshake don't hold the hostmap lock so we should be able to drop it here. This lock was originally added with: https://github.com/slackhq/nebula/pull/954 --- handshake_manager.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/handshake_manager.go b/handshake_manager.go index 640227a7e..2372ced09 100644 --- a/handshake_manager.go +++ b/handshake_manager.go @@ -356,10 +356,11 @@ func (hm *HandshakeManager) handleOutbound(vpnIp iputil.VpnIp, lighthouseTrigger // GetOrHandshake will try to find a hostinfo with a fully formed tunnel or start a new handshake if one is not present // The 2nd argument will be true if the hostinfo is ready to transmit traffic func (hm *HandshakeManager) GetOrHandshake(vpnIp iputil.VpnIp, cacheCb func(*HandshakeHostInfo)) (*HostInfo, bool) { - // Check the main hostmap and maintain a read lock if our host is not there hm.mainHostMap.RLock() - if h, ok := hm.mainHostMap.Hosts[vpnIp]; ok { - hm.mainHostMap.RUnlock() + h, ok := hm.mainHostMap.Hosts[vpnIp] + hm.mainHostMap.RUnlock() + + if ok { // Do not attempt promotion if you are a lighthouse if !hm.lightHouse.amLighthouse { h.TryPromoteBest(hm.mainHostMap.GetPreferredRanges(), hm.f) @@ -367,7 +368,6 @@ func (hm *HandshakeManager) GetOrHandshake(vpnIp iputil.VpnIp, cacheCb func(*Han return h, true } - defer hm.mainHostMap.RUnlock() return hm.StartHandshake(vpnIp, cacheCb), false }