Skip to content

Commit

Permalink
fix(controller): cache deadlock on delete and re-add cluster (cherry-…
Browse files Browse the repository at this point in the history
…pick #14780) (#14815)

Signed-off-by: Alexander Matyushentsev <AMatyushentsev@gmail.com>
  • Loading branch information
alexmt authored Aug 1, 2023
1 parent d7c2dd5 commit 3ab4b2b
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 2 deletions.
6 changes: 4 additions & 2 deletions controller/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -697,12 +697,14 @@ func (c *liveStateCache) handleModEvent(oldCluster *appv1.Cluster, newCluster *a
}

func (c *liveStateCache) handleDeleteEvent(clusterServer string) {
c.lock.Lock()
defer c.lock.Unlock()
c.lock.RLock()
cluster, ok := c.clusters[clusterServer]
c.lock.RUnlock()
if ok {
cluster.Invalidate()
c.lock.Lock()
delete(c.clusters, clusterServer)
c.lock.Unlock()
}
}

Expand Down
97 changes: 97 additions & 0 deletions controller/cache/cache_test.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package cache

import (
"context"
"errors"
"net"
"net/url"
"sync"
"testing"
"time"

apierr "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime/schema"
Expand All @@ -14,8 +17,10 @@ import (
"github.com/argoproj/gitops-engine/pkg/cache"
"github.com/argoproj/gitops-engine/pkg/cache/mocks"
"github.com/stretchr/testify/mock"
"k8s.io/client-go/kubernetes/fake"

appv1 "github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1"
argosettings "github.com/argoproj/argo-cd/v2/util/settings"
)

type netError string
Expand Down Expand Up @@ -106,6 +111,98 @@ func TestHandleAddEvent_ClusterExcluded(t *testing.T) {
assert.Len(t, clustersCache.clusters, 0)
}

func TestHandleDeleteEvent_CacheDeadlock(t *testing.T) {
testCluster := &appv1.Cluster{
Server: "https://mycluster",
Config: appv1.ClusterConfig{Username: "bar"},
}
fakeClient := fake.NewSimpleClientset()
settingsMgr := argosettings.NewSettingsManager(context.TODO(), fakeClient, "argocd")
externalLockRef := sync.RWMutex{}
gitopsEngineClusterCache := &mocks.ClusterCache{}
clustersCache := liveStateCache{
clusters: map[string]cache.ClusterCache{
testCluster.Server: gitopsEngineClusterCache,
},
clusterFilter: func(cluster *appv1.Cluster) bool {
return true
},
settingsMgr: settingsMgr,
// Set the lock here so we can reference it later
// nolint We need to overwrite here to have access to the lock
lock: externalLockRef,
}
channel := make(chan string)
// Mocked lock held by the gitops-engine cluster cache
mockMutex := sync.RWMutex{}
// Locks to force trigger condition during test
// Condition order:
// EnsuredSynced -> Locks gitops-engine
// handleDeleteEvent -> Locks liveStateCache
// EnsureSynced via sync, newResource, populateResourceInfoHandler -> attempts to Lock liveStateCache
// handleDeleteEvent via cluster.Invalidate -> attempts to Lock gitops-engine
handleDeleteWasCalled := sync.Mutex{}
engineHoldsLock := sync.Mutex{}
handleDeleteWasCalled.Lock()
engineHoldsLock.Lock()
gitopsEngineClusterCache.On("EnsureSynced").Run(func(args mock.Arguments) {
// Held by EnsureSync calling into sync and watchEvents
mockMutex.Lock()
defer mockMutex.Unlock()
// Continue Execution of timer func
engineHoldsLock.Unlock()
// Wait for handleDeleteEvent to be called triggering the lock
// on the liveStateCache
handleDeleteWasCalled.Lock()
t.Logf("handleDelete was called, EnsureSynced continuing...")
handleDeleteWasCalled.Unlock()
// Try and obtain the lock on the liveStateCache
alreadyFailed := !externalLockRef.TryLock()
if alreadyFailed {
channel <- "DEADLOCKED -- EnsureSynced could not obtain lock on liveStateCache"
return
}
externalLockRef.Lock()
t.Logf("EnsureSynce was able to lock liveStateCache")
externalLockRef.Unlock()
}).Return(nil).Once()
gitopsEngineClusterCache.On("Invalidate").Run(func(args mock.Arguments) {
// If deadlock is fixed should be able to acquire lock here
alreadyFailed := !mockMutex.TryLock()
if alreadyFailed {
channel <- "DEADLOCKED -- Invalidate could not obtain lock on gitops-engine"
return
}
mockMutex.Lock()
t.Logf("Invalidate was able to lock gitops-engine cache")
mockMutex.Unlock()
}).Return()
go func() {
// Start the gitops-engine lock holds
go func() {
err := gitopsEngineClusterCache.EnsureSynced()
if err != nil {
assert.Fail(t, err.Error())
}
}()
// Wait for EnsureSynced to grab the lock for gitops-engine
engineHoldsLock.Lock()
t.Log("EnsureSynced has obtained lock on gitops-engine")
engineHoldsLock.Unlock()
// Run in background
go clustersCache.handleDeleteEvent(testCluster.Server)
// Allow execution to continue on clusters cache call to trigger lock
handleDeleteWasCalled.Unlock()
channel <- "PASSED"
}()
select {
case str := <-channel:
assert.Equal(t, "PASSED", str, str)
case <-time.After(5 * time.Second):
assert.Fail(t, "Ended up in deadlock")
}
}

func TestIsRetryableError(t *testing.T) {
var (
tlsHandshakeTimeoutErr net.Error = netError("net/http: TLS handshake timeout")
Expand Down

0 comments on commit 3ab4b2b

Please sign in to comment.