Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(controller): cache deadlock on delete and re-add cluster #14780

Merged
merged 2 commits into from
Jul 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions controller/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -775,12 +775,14 @@ func (c *liveStateCache) handleModEvent(oldCluster *appv1.Cluster, newCluster *a
}

func (c *liveStateCache) handleDeleteEvent(clusterServer string) {
c.lock.Lock()
defer c.lock.Unlock()
c.lock.RLock()
cluster, ok := c.clusters[clusterServer]
c.lock.RUnlock()
if ok {
cluster.Invalidate()
c.lock.Lock()
delete(c.clusters, clusterServer)
c.lock.Unlock()
}
}

Expand Down
99 changes: 98 additions & 1 deletion controller/cache/cache_test.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
package cache

import (
"context"
"errors"
"net"
"net/url"
"sync"
"testing"
"time"

"github.com/stretchr/testify/assert"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
apierr "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
Expand All @@ -16,8 +19,10 @@ import (
"github.com/argoproj/gitops-engine/pkg/cache/mocks"
"github.com/argoproj/gitops-engine/pkg/health"
"github.com/stretchr/testify/mock"
"k8s.io/client-go/kubernetes/fake"

appv1 "github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1"
argosettings "github.com/argoproj/argo-cd/v2/util/settings"
)

type netError string
Expand Down Expand Up @@ -108,6 +113,98 @@ func TestHandleAddEvent_ClusterExcluded(t *testing.T) {
assert.Len(t, clustersCache.clusters, 0)
}

func TestHandleDeleteEvent_CacheDeadlock(t *testing.T) {
testCluster := &appv1.Cluster{
Server: "https://mycluster",
Config: appv1.ClusterConfig{Username: "bar"},
}
fakeClient := fake.NewSimpleClientset()
settingsMgr := argosettings.NewSettingsManager(context.TODO(), fakeClient, "argocd")
externalLockRef := sync.RWMutex{}
gitopsEngineClusterCache := &mocks.ClusterCache{}
clustersCache := liveStateCache{
clusters: map[string]cache.ClusterCache{
testCluster.Server: gitopsEngineClusterCache,
},
clusterFilter: func(cluster *appv1.Cluster) bool {
return true
},
settingsMgr: settingsMgr,
// Set the lock here so we can reference it later
// nolint We need to overwrite here to have access to the lock
lock: externalLockRef,
}
channel := make(chan string)
// Mocked lock held by the gitops-engine cluster cache
mockMutex := sync.RWMutex{}
// Locks to force trigger condition during test
// Condition order:
// EnsuredSynced -> Locks gitops-engine
// handleDeleteEvent -> Locks liveStateCache
// EnsureSynced via sync, newResource, populateResourceInfoHandler -> attempts to Lock liveStateCache
// handleDeleteEvent via cluster.Invalidate -> attempts to Lock gitops-engine
handleDeleteWasCalled := sync.Mutex{}
engineHoldsLock := sync.Mutex{}
handleDeleteWasCalled.Lock()
engineHoldsLock.Lock()
gitopsEngineClusterCache.On("EnsureSynced").Run(func(args mock.Arguments) {
// Held by EnsureSync calling into sync and watchEvents
mockMutex.Lock()
defer mockMutex.Unlock()
// Continue Execution of timer func
engineHoldsLock.Unlock()
// Wait for handleDeleteEvent to be called triggering the lock
// on the liveStateCache
handleDeleteWasCalled.Lock()
t.Logf("handleDelete was called, EnsureSynced continuing...")
handleDeleteWasCalled.Unlock()
// Try and obtain the lock on the liveStateCache
alreadyFailed := !externalLockRef.TryLock()
if alreadyFailed {
channel <- "DEADLOCKED -- EnsureSynced could not obtain lock on liveStateCache"
return
}
externalLockRef.Lock()
t.Logf("EnsureSynce was able to lock liveStateCache")
externalLockRef.Unlock()
}).Return(nil).Once()
gitopsEngineClusterCache.On("Invalidate").Run(func(args mock.Arguments) {
// If deadlock is fixed should be able to acquire lock here
alreadyFailed := !mockMutex.TryLock()
if alreadyFailed {
channel <- "DEADLOCKED -- Invalidate could not obtain lock on gitops-engine"
return
}
mockMutex.Lock()
t.Logf("Invalidate was able to lock gitops-engine cache")
mockMutex.Unlock()
}).Return()
go func() {
// Start the gitops-engine lock holds
go func() {
err := gitopsEngineClusterCache.EnsureSynced()
if err != nil {
assert.Fail(t, err.Error())
}
}()
// Wait for EnsureSynced to grab the lock for gitops-engine
engineHoldsLock.Lock()
t.Log("EnsureSynced has obtained lock on gitops-engine")
engineHoldsLock.Unlock()
// Run in background
go clustersCache.handleDeleteEvent(testCluster.Server)
// Allow execution to continue on clusters cache call to trigger lock
handleDeleteWasCalled.Unlock()
channel <- "PASSED"
}()
select {
case str := <-channel:
assert.Equal(t, "PASSED", str, str)
case <-time.After(5 * time.Second):
assert.Fail(t, "Ended up in deadlock")
}
}

func TestIsRetryableError(t *testing.T) {
var (
tlsHandshakeTimeoutErr net.Error = netError("net/http: TLS handshake timeout")
Expand Down