Skip to content

Commit

Permalink
Add retries for errors encountered on first setup
Browse files Browse the repository at this point in the history
This addresses the following failures:

The connection to the server 34.77.95.157 was refused - did you specify
the right host or port?

apply files: set default namespaces: discover server resources: unable
to retrieve the complete list of server APIs: metrics.k8s.io/v1beta1:
the server is currently unable to handle the request

b/179011100

Change-Id: Ifc372dcbcf4914ff23ed88b2e3d39f62f45b17fc
GitOrigin-RevId: c02c1ea
  • Loading branch information
drigz authored and copybara-github committed Feb 2, 2021
1 parent 23785fd commit 38e9c21
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 13 deletions.
23 changes: 14 additions & 9 deletions deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,20 @@ function helm_charts {
|| die "create: failed to get cluster credentials"
[[ -n "${CURRENT_CONTEXT}" ]] && kubectl config use-context "${CURRENT_CONTEXT}"


# Wait for the GKE cluster to be reachable.
# Polls 'kc get serviceaccount default' quietly, pausing one second after each
# failed attempt. i counts the failed attempts so far.
i=0
until kc get serviceaccount default &>/dev/null; do
sleep 1
i=$((i + 1))
# After 60 failed quiet attempts, stop hiding the error output.
if ((i >= 60)) ; then
# Try again, without suppressing stderr this time, so the underlying error
# is printed before die is called.
if ! kc get serviceaccount default >/dev/null; then
die "'kubectl get serviceaccount default' failed"
fi
fi
done

# Generate a certificate authority if none exists yet. It is used by
# cert-manager to issue new cluster-internal certificates.
# Avoid creating a new one on each run as rotation may cause intermittent
Expand Down Expand Up @@ -324,20 +338,11 @@ EOF
kc apply --validate=false -f ${DIR}/third_party/cert-manager/00-crds.yaml
kc label --overwrite namespace default certmanager.k8s.io/disable-validation=true

echo "installing cert-manager to ${KUBE_CONTEXT}..."
# Installation of cert-manager fails if apiservice v1beta1.metrics.k8s.io is not available.
# This can happen when terraform has upgraded the Kubernetes control plane beforehand.
kc wait apiservice v1beta1.metrics.k8s.io --for condition=Available --timeout=600s

# cert-manager/templates/webhook-rbac.yaml has hard-coded 'kube-system' ns
${HELM} template -n cert-manager --set global.rbac.create=false ${DIR}/third_party/cert-manager/cert-manager-v0.10.1.tgz \
| ${SYNK} apply cert-manager -n default -f - \
|| die "Synk failed for cert-manager"

# Wait for webhook installation to avoid the error:
# the server is currently unable to handle the request
kc wait deployment cert-manager-webhook --for condition=Available --timeout=600s

echo "installing base-cloud to ${KUBE_CONTEXT}..."
${HELM} template -n base-cloud ${values} \
./bazel-bin/src/app_charts/base/base-cloud-0.0.1.tgz \
Expand Down
1 change: 1 addition & 0 deletions src/go/cmd/synk/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ go_library(
deps = [
"//src/go/pkg/apis/apps/v1alpha1:go_default_library",
"//src/go/pkg/synk:go_default_library",
"@com_github_cenkalti_backoff//:go_default_library",
"@com_github_pkg_errors//:go_default_library",
"@com_github_spf13_cobra//:go_default_library",
"@io_k8s_apimachinery//pkg/apis/meta/v1/unstructured:go_default_library",
Expand Down
30 changes: 26 additions & 4 deletions src/go/cmd/synk/synk.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ import (
"fmt"
"os"
"strings"
"time"

"github.com/cenkalti/backoff"
apps "github.com/googlecloudrobotics/core/src/go/pkg/apis/apps/v1alpha1"
"github.com/googlecloudrobotics/core/src/go/pkg/synk"
"github.com/pkg/errors"
Expand All @@ -31,7 +33,13 @@ import (
_ "k8s.io/client-go/plugin/pkg/client/auth"
)

const (
// retryBackoff is the fixed delay passed to the constant backoff that
// spaces out retries of a failed apply.
retryBackoff = 5 * time.Second
)

var (
retries uint64

cmdRoot = &cobra.Command{
Use: "synk",
Short: "A tool to sync manifests with a cluster.",
Expand Down Expand Up @@ -60,6 +68,8 @@ func main() {
restOpts.AddFlags(cmdRoot.PersistentFlags())
resourceOpts.AddFlags(cmdApply.PersistentFlags())

cmdApply.PersistentFlags().Uint64Var(&retries, "retries", 60, "max number of retries for transient errors, with a 5 second constant backoff")

cmdRoot.AddCommand(cmdInit)
cmdRoot.AddCommand(cmdApply)
cmdRoot.AddCommand(cmdDelete)
Expand Down Expand Up @@ -167,15 +177,27 @@ func apply(name string) error {
opts := &synk.ApplyOptions{
Namespace: namespace,
EnforceNamespace: enforceNamespace,
Log: log,
}
if _, err := s.Apply(context.Background(), name, opts, resources...); err != nil {
Log: logAction,
}
if err := backoff.Retry(
func() error {
_, err := s.Apply(context.Background(), name, opts, resources...)
if err != nil {
if synk.IsTransientErr(err) {
return err
}
return backoff.Permanent(err)
}
return nil
},
backoff.WithMaxRetries(backoff.NewConstantBackOff(retryBackoff), retries),
); err != nil {
return errors.Wrap(err, "apply files")
}
return nil
}

func log(r *unstructured.Unstructured, action apps.ResourceAction, status, msg string) {
func logAction(r *unstructured.Unstructured, action apps.ResourceAction, status, msg string) {
// Remove some visual clutter by only showing the resource for successes.
if status == synk.StatusSuccess {
fmt.Fprintf(os.Stderr, "[%s] %s %s/%s %s/%s\n",
Expand Down
2 changes: 2 additions & 0 deletions src/go/pkg/synk/synk.go
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,8 @@ func IsTransientErr(err error) bool {
case k8serrors.IsTimeout(err):
case k8serrors.IsTooManyRequests(err):
case k8serrors.IsServiceUnavailable(err):
// May happen shortly after CRD creation.
case discovery.IsGroupDiscoveryFailedError(err):
// May happen if a chart is deleted and immediately recreated.
// https://github.com/kubernetes/kubernetes/blob/d2a081c8e14e21e28fe5bdfa38a817ef9c0bb8e3/staging/src/k8s.io/apiserver/pkg/admission/plugin/namespace/lifecycle/admission.go#L173
case strings.Contains(err.Error(), "unable to create new content in namespace"):
Expand Down

0 comments on commit 38e9c21

Please sign in to comment.