canonical · tomponline · Dec 5, 2023 · Nov 28, 2023 · tomponline · Nov 29, 2023
diff --git a/lxd/cluster/membership.go b/lxd/cluster/membership.go
@@ -24,6 +24,11 @@ import (
 	"github.com/canonical/lxd/shared/version"
 )
 
+// clusterBusyError is returned by dqlite if attempting to join a cluster at the same time as a role-change.
+// This error tells us we can retry and probably join the cluster or fail due to something else.
+// The error code here is SQLITE_BUSY.
+var clusterBusyError = fmt.Errorf("a configuration change is already in progress (5)")
 // IsRetriableError reports whether err looks like a transient failure, in
 // which case the caller may safely retry the interaction.
 func IsRetriableError(err error) bool {
 	// Typed checks first: a driver error carrying the busy code, or one of
 	// the sqlite3 locked/busy errors somewhere in the chain.
 	var busy *driver.Error
 	switch {
 	case errors.As(err, &busy) && busy.Code == driver.ErrBusy:
 		return true
 	case errors.Is(err, sqlite3.ErrLocked), errors.Is(err, sqlite3.ErrBusy):
 		return true
 	}
 
 	// Otherwise fall back to matching known message fragments, checking
 	// each level of the error chain in turn.
 	fragments := []string{
 		"database is locked",
 		"cannot start a transaction within a transaction",
 		"bad connection",
 		"checkpoint in progress",
 	}
 
 	for cur := err; cur != nil; cur = errors.Unwrap(cur) {
 		msg := cur.Error()
 		for _, fragment := range fragments {
 			if strings.Contains(msg, fragment) {
 				return true
 			}
 		}
 	}
 
 	return false
 }
 // IsRetriableError tells the caller whether the given error is likely to be
 // temporary, meaning the failed interaction can be attempted again.
 func IsRetriableError(err error) bool {
 	var driverErr *driver.Error
 
 	// Short-circuit order matters: the driver busy code is checked before
 	// the sqlite3 locked/busy errors.
 	if (errors.As(err, &driverErr) && driverErr.Code == driver.ErrBusy) ||
 		errors.Is(err, sqlite3.ErrLocked) ||
 		errors.Is(err, sqlite3.ErrBusy) {
 		return true
 	}
 
 	// looksTransient matches a single error message against the known
 	// retriable message fragments.
 	looksTransient := func(message string) bool {
 		switch {
 		case strings.Contains(message, "database is locked"):
 			return true
 		case strings.Contains(message, "cannot start a transaction within a transaction"):
 			return true
 		case strings.Contains(message, "bad connection"):
 			return true
 		case strings.Contains(message, "checkpoint in progress"):
 			return true
 		}
 
 		return false
 	}
 
 	// Walk the chain, peeling off one wrapping layer per iteration.
 	for layer := err; layer != nil; layer = errors.Unwrap(layer) {
 		if looksTransient(layer.Error()) {
 			return true
 		}
 	}
 
 	return false
 }
+
 // Bootstrap turns a non-clustered LXD instance into the first (and leader)
 // node of a new LXD cluster.
 //
@@ -431,9 +436,27 @@ func Join(state *state.State, gateway *Gateway, networkCert *shared.CertInfo, se
 	logger.Info("Adding node to cluster", logger.Ctx{"id": info.ID, "local": info.Address, "role": info.Role})
 	ctx, cancel = context.WithTimeout(context.Background(), time.Minute)
 	defer cancel()
-	err = client.Add(ctx, info.NodeInfo)
-	if err != nil {
-		return fmt.Errorf("Failed to join cluster: %w", err)
+
+	// Repeatedly try to join in case the cluster is busy with a role-change.
+	joined := false
+	for !joined {
+		select {
+		case <-ctx.Done():
+			return fmt.Errorf("Failed to join cluster: %w", ctx.Err())
+		default:
+			err = client.Add(ctx, info.NodeInfo)
+			if err != nil && err.Error() == clusterBusyError.Error() {
+				// If the cluster is busy with a role change, sleep a second and then keep trying to join.
+				time.Sleep(1 * time.Second)
+				continue
+			}
+
+			if err != nil {
+				return fmt.Errorf("Failed to join cluster: %w", err)
+			}
+
+			joined = true
+		}
 	}
 
 	// Make sure we can actually connect to the cluster database through