From 37d3093ba4df655042cc4104856417e6fa202985 Mon Sep 17 00:00:00 2001 From: Lennart Altenhof Date: Sun, 5 Nov 2023 19:32:02 +0100 Subject: [PATCH 1/4] fix(reaper): fix race condition when reusing reapers Reaper reuse/creation logic has been adjusted to facilitate the reuse of already running reapers. This includes using a specific naming convention based on the session id, and handling failed container creation attempts due to name conflicts by retrieving the already running reaper container. This fixes a race condition when tests are run in parallel in multiple packages which renders global locks ineffective. --- reaper.go | 79 +++++++++++++++++++++++++++++++++++++++++--------- reaper_test.go | 46 +++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 13 deletions(-) diff --git a/reaper.go b/reaper.go index f5d571f2c0..17ff54576a 100644 --- a/reaper.go +++ b/reaper.go @@ -6,6 +6,7 @@ import ( "fmt" "math/rand" "net" + "regexp" "strings" "sync" "time" @@ -50,6 +51,14 @@ func NewReaper(ctx context.Context, sessionID string, provider ReaperProvider, r return reuseOrCreateReaper(ctx, sessionID, provider, WithImageName(reaperImageName)) } +// reaperContainerNameFromSessionID returns the container name that uniquely +// identifies the container based on the session id. +func reaperContainerNameFromSessionID(sessionID string) string { + // The session id is 64 characters, so we will not hit the limit of 128 + // characters for container names. + return fmt.Sprintf("reaper_%s", sessionID) +} + // lookUpReaperContainer returns a DockerContainer type with the reaper container in the case // it's found in the running state, and including the labels for sessionID, reaper, and ryuk. 
// It will perform a retry with exponential backoff to allow for the container to be started and @@ -67,7 +76,7 @@ func lookUpReaperContainer(ctx context.Context, sessionID string) (*DockerContai // we want random intervals between 100ms and 500ms for concurrent executions // to not be synchronized: it could be the case that multiple executions of this - // function happen at the same time (specially when called from a different test + // function happen at the same time (specifically when called from a different test // process execution), and we want to avoid that they all try to find the reaper // container at the same time. exp.InitialInterval = time.Duration(rand.Intn(5)*100) * time.Millisecond @@ -82,6 +91,7 @@ func lookUpReaperContainer(ctx context.Context, sessionID string) (*DockerContai filters.Arg("label", fmt.Sprintf("%s=%s", testcontainersdocker.LabelSessionID, sessionID)), filters.Arg("label", fmt.Sprintf("%s=%t", testcontainersdocker.LabelReaper, true)), filters.Arg("label", fmt.Sprintf("%s=%t", testcontainersdocker.LabelRyuk, true)), + filters.Arg("name", reaperContainerNameFromSessionID(sessionID)), } resp, err := dockerClient.ContainerList(ctx, types.ContainerListOptions{ @@ -146,19 +156,11 @@ func reuseOrCreateReaper(ctx context.Context, sessionID string, provider ReaperP reaperContainer, err := lookUpReaperContainer(context.Background(), sessionID) if err == nil && reaperContainer != nil { // The reaper container exists as a Docker container: re-use it - endpoint, err := reaperContainer.PortEndpoint(ctx, "8080", "") + Logger.Printf("🔥 Reaper obtained from Docker for this test session %s", reaperContainer.ID) + reaperInstance, err = reuseReaperContainer(ctx, sessionID, provider, reaperContainer) if err != nil { return nil, err } - - Logger.Printf("🔥 Reaper obtained from Docker for this test session %s", reaperContainer.ID) - reaperInstance = &Reaper{ - Provider: provider, - SessionID: sessionID, - Endpoint: endpoint, - container: reaperContainer, 
- } - return reaperInstance, nil } @@ -182,8 +184,25 @@ func reuseOrCreateReaper(ctx context.Context, sessionID string, provider ReaperP return reaperInstance, nil } -// newReaper creates a Reaper with a sessionID to identify containers and a provider to use -// Do not call this directly, use reuseOrCreateReaper instead +var createContainerFailDueToNameConflictRegex = regexp.MustCompile("Conflict. The container name .* is already in use by container .*") + +// reuseReaperContainer constructs a Reaper from an already running reaper +// DockerContainer. +func reuseReaperContainer(ctx context.Context, sessionID string, provider ReaperProvider, reaperContainer *DockerContainer) (*Reaper, error) { + endpoint, err := reaperContainer.PortEndpoint(ctx, "8080", "") + if err != nil { + return nil, err + } + return &Reaper{ + Provider: provider, + SessionID: sessionID, + Endpoint: endpoint, + container: reaperContainer, + }, nil +} + +// newReaper creates a Reaper with a sessionID to identify containers and a +// provider to use. Do not call this directly, use reuseOrCreateReaper instead. func newReaper(ctx context.Context, sessionID string, provider ReaperProvider, opts ...ContainerOption) (*Reaper, error) { dockerHostMount := testcontainersdocker.ExtractDockerSocket(ctx) @@ -209,6 +228,7 @@ func newReaper(ctx context.Context, sessionID string, provider ReaperProvider, o Mounts: Mounts(BindMount(dockerHostMount, "/var/run/docker.sock")), Privileged: tcConfig.RyukPrivileged, WaitingFor: wait.ForListeningPort(listeningPort), + Name: reaperContainerNameFromSessionID(sessionID), ReaperOptions: opts, HostConfigModifier: func(hc *container.HostConfig) { hc.AutoRemove = true @@ -237,6 +257,39 @@ func newReaper(ctx context.Context, sessionID string, provider ReaperProvider, o c, err := provider.RunContainer(ctx, req) if err != nil { + // We need to check whether the error is caused by a container with the same name + // already existing due to race conditions. 
We manually match the error message + // as we do not have any error types to check against. + if createContainerFailDueToNameConflictRegex.MatchString(err.Error()) { + // Manually retrieve the already running reaper container. As it may take a while + // to observe the container, despite creation failure, we retry a few times. + const timeout = 5 * time.Second + const cooldown = 100 * time.Millisecond + start := time.Now() + var reaperContainer *DockerContainer + for time.Since(start) < timeout { + reaperContainer, err = lookUpReaperContainer(ctx, sessionID) + if err == nil && reaperContainer != nil { + break + } + select { + case <-ctx.Done(): + case <-time.After(cooldown): + } + } + if err != nil { + return nil, fmt.Errorf("look up reaper container because creation failed due to name conflict: %w", err) + } + if reaperContainer == nil { + return nil, fmt.Errorf("look up reaper container returned nil although creation failed due to name conflict") + } + Logger.Printf("🔥 Canceling creation - Reaper obtained from Docker for this test session %s", reaperContainer.ID) + reaper, err := reuseReaperContainer(ctx, sessionID, provider, reaperContainer) + if err != nil { + return nil, err + } + return reaper, nil + } return nil, err } reaper.container = c diff --git a/reaper_test.go b/reaper_test.go index 1aa361b179..22572b375a 100644 --- a/reaper_test.go +++ b/reaper_test.go @@ -511,3 +511,49 @@ func TestReaper_reuseItFromOtherTestProgramUsingDocker(t *testing.T) { terminateContainerOnEnd(t, ctx, reaper.container) } } + +// TestReaper_ReuseRunning tests whether reusing the reaper if using +// testcontainers from concurrently multiple packages works as expected. In this +// case, global locks are without any effect as Go tests different packages +// isolated. Therefore, this test does not use the same logic with locks on +// purpose. 
We expect reaper creation to still succeed in case a reaper is +// already running for the same session id by returning its container instance +// instead. +func TestReaper_ReuseRunning(t *testing.T) { + const concurrency = 64 + + timeout, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + sessionID := SessionID() + + dockerProvider, err := NewDockerProvider() + require.NoError(t, err, "new docker provider should not fail") + + obtainedReaperContainerIDs := make([]string, concurrency) + var wg sync.WaitGroup + for i := 0; i < concurrency; i++ { + i := i + wg.Add(1) + go func() { + defer wg.Done() + reaperContainer, err := lookUpReaperContainer(timeout, sessionID) + if err == nil && reaperContainer != nil { + // Found. + obtainedReaperContainerIDs[i] = reaperContainer.GetContainerID() + return + } + // Not found -> create. + createdReaper, err := newReaper(timeout, sessionID, dockerProvider) + require.NoError(t, err, "new reaper should not fail") + obtainedReaperContainerIDs[i] = createdReaper.container.GetContainerID() + }() + } + wg.Wait() + + // Assure that all calls returned the same container. + firstContainerID := obtainedReaperContainerIDs[0] + for i, containerID := range obtainedReaperContainerIDs { + assert.Equal(t, firstContainerID, containerID, "call %d should have returned same container id", i) + } +} From 109c3ac4c68c5d1bf67526c5aebfc04b88dc8896 Mon Sep 17 00:00:00 2001 From: Lennart Altenhof Date: Mon, 6 Nov 2023 17:32:41 +0100 Subject: [PATCH 2/4] docs(reaper): make reaper recovery log entries the same The log message related to the reaper container in the reaper.go file has been updated for better clarity. The redundant phrase "Canceling creation -" has been removed as it does not provide additional relevant information, aiming to improve log readability. 
--- reaper.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reaper.go b/reaper.go index 17ff54576a..1c0d3d753a 100644 --- a/reaper.go +++ b/reaper.go @@ -283,7 +283,7 @@ func newReaper(ctx context.Context, sessionID string, provider ReaperProvider, o if reaperContainer == nil { return nil, fmt.Errorf("look up reaper container returned nil although creation failed due to name conflict") } - Logger.Printf("🔥 Canceling creation - Reaper obtained from Docker for this test session %s", reaperContainer.ID) + Logger.Printf("🔥 Reaper obtained from Docker for this test session %s", reaperContainer.ID) reaper, err := reuseReaperContainer(ctx, sessionID, provider, reaperContainer) if err != nil { return nil, err From 90046771d790081a12f14b4987ebf34034e5f836 Mon Sep 17 00:00:00 2001 From: Lennart Altenhof Date: Mon, 6 Nov 2023 17:56:06 +0100 Subject: [PATCH 3/4] docs(reaper): improve documentation for issues during reaper recovery Further comments were added to error handling in the reaper.go file, to better account for possible race conditions. This includes conditions where a container creation fails due to name conflict but no containers are visible in list-requests. Additionally, a possible scenario where the container may have died between requests has been covered. --- reaper.go | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/reaper.go b/reaper.go index 1c0d3d753a..0bb8b12cbf 100644 --- a/reaper.go +++ b/reaper.go @@ -261,8 +261,13 @@ func newReaper(ctx context.Context, sessionID string, provider ReaperProvider, o // already existing due to race conditions. We manually match the error message // as we do not have any error types to check against. if createContainerFailDueToNameConflictRegex.MatchString(err.Error()) { - // Manually retrieve the already running reaper container. As it may take a while - // to observe the container, despite creation failure, we retry a few times.
+ // Manually retrieve the already running reaper container. However, we need to + // use retries here as there are two possible race conditions that might lead to + // errors: In most cases, there is a small delay between container creation and + // actually being visible in list-requests. This means that creation might fail + // due to name conflicts, but when we list containers with this name, we do not + // get any results. In another case, the container might have simply died in the + // meantime and therefore cannot be found. const timeout = 5 * time.Second const cooldown = 100 * time.Millisecond start := time.Now() @@ -280,6 +285,10 @@ func newReaper(ctx context.Context, sessionID string, provider ReaperProvider, o if err != nil { return nil, fmt.Errorf("look up reaper container because creation failed due to name conflict: %w", err) } + // If the reaper container was not found, it is most likely to have died in + // between as we can exclude any client errors because of the previous error + // check. Because the reaper should only die if it performed clean-ups, we can + // fail here as the reaper timeout needs to be increased, anyway. 
if reaperContainer == nil { return nil, fmt.Errorf("look up reaper container returned nil although creation failed due to name conflict") } From 181e7bc4ece1cda45c0b17b364d11677427bc6d0 Mon Sep 17 00:00:00 2001 From: Lennart Altenhof Date: Tue, 7 Nov 2023 23:28:11 +0100 Subject: [PATCH 4/4] style(reaper): change error message style in reaper recovery --- reaper.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reaper.go b/reaper.go index 0bb8b12cbf..dda10c281e 100644 --- a/reaper.go +++ b/reaper.go @@ -283,7 +283,7 @@ func newReaper(ctx context.Context, sessionID string, provider ReaperProvider, o } } if err != nil { - return nil, fmt.Errorf("look up reaper container because creation failed due to name conflict: %w", err) + return nil, fmt.Errorf("look up reaper container due to name conflict failed: %w", err) } // If the reaper container was not found, it is most likely to have died in // between as we can exclude any client errors because of the previous error