From d2a62499de2aa9dbed7504952dbf550ea3bb34bc Mon Sep 17 00:00:00 2001 From: roos Date: Thu, 7 May 2020 09:55:11 +0200 Subject: [PATCH 01/10] Investigate errors --- .buildkite/pipeline.yml | 102 ++-------------------------------------- 1 file changed, 3 insertions(+), 99 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a8e9ec4428..60ff42a3df 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -11,81 +11,6 @@ steps: - exit_status: -1 # Agent was lost - exit_status: 255 # Forced agent shutdown timeout_in_minutes: 10 - - label: ":bazel: Go tests" - if: build.message !~ /\[doc\]/ - command: - - bazel test //go/... --print_relative_test_log_paths - key: go_tests - artifact_paths: - - "artifacts.out/**/*" - retry: - automatic: - - exit_status: -1 # Agent was lost - - exit_status: 255 # Forced agent shutdown - timeout_in_minutes: 10 - - label: ":python: Python tests" - if: build.message !~ /\[doc\]/ - command: ./scion.sh test py - key: py_tests - retry: - automatic: - - exit_status: -1 # Agent was lost - - exit_status: 255 # Forced agent shutdown - - label: "Check licenses" - command: - - mkdir -p /tmp/test-artifacts/licenses - - ./tools/licenses.sh /tmp/test-artifacts/licenses $BUILDKITE_PIPELINE_SLUG - - diff -rNu3 /tmp/test-artifacts/licenses ./licenses/data - key: licenses - retry: - automatic: - - exit_status: -1 # Agent was lost - - exit_status: 255 # Forced agent shutdown - - label: "Check generated go_deps.bzl file is up to date with go.mod" - if: build.message !~ /\[doc\]/ - command: - - mkdir -p /tmp/test-artifacts - - cp go.mod go.sum go_deps.bzl /tmp/test-artifacts/ - - make godeps -B - - bazel-${BUILDKITE_PIPELINE_SLUG}/external/go_sdk/bin/go mod tidy - - diff -u /tmp/test-artifacts/go.mod go.mod - - diff -u /tmp/test-artifacts/go.sum go.sum - - diff -u /tmp/test-artifacts/go_deps.bzl go_deps.bzl - key: go_deps_lint - retry: - automatic: - - exit_status: -1 # Agent was lost - - exit_status: 255 # Forced agent shutdown - - label: "Check generated go/proto files in git" - if: build.message !~ /\[doc\]/ - command: - - mkdir -p /tmp/test-artifacts - - cp -R go/proto/ /tmp/test-artifacts/ - - make gogen - - diff -ur /tmp/test-artifacts/proto/ go/proto/ - key: go_gen_lint - retry: - automatic: - - exit_status: -1 # Agent was lost - - exit_status: 255 # Forced agent shutdown - - label: "Lint" - command: ./scion.sh lint - key: lint - retry: - automatic: - - exit_status: -1 # Agent was lost - - exit_status: 255 # Forced agent shutdown - - label: ":bazel: Integration tests" - if: build.message !~ /\[doc\]/ - command: - - bazel test //integration/... - key: integration_tests - artifact_paths: - - "artifacts.out/**/*" - retry: - automatic: - - exit_status: -1 # Agent was lost - - exit_status: 255 # Forced agent shutdown - label: "Revocation tests" if: build.message !~ /\[doc\]/ command: @@ -103,6 +28,7 @@ steps: echo "--- Shutting down SCION topology" ./scion.sh stop echo "SCION topology successfully shut down" + parallelism: 10 artifact_paths: - "artifacts.out/**/*" retry: @@ -128,6 +54,7 @@ steps: echo "--- Shutting down SCION topology" ./scion.sh stop echo "SCION topology successfully shut down" + parallelism: 10 artifact_paths: - "artifacts.out/**/*" timeout_in_minutes: 10 @@ -136,30 +63,6 @@ steps: automatic: - exit_status: -1 # Agent was lost - exit_status: 255 # Forced agent shutdown - - label: ":docker: Integration: end2end_integration" - command: - - ./scion.sh topology -t -d -c topology/Tiny.topo - - ./scion.sh run - - docker-compose -f gen/scion-dc.yml -p scion up -d $(docker-compose -f gen/scion-dc.yml config --services | grep tester) - - sleep 10 - # - ./bin/cert_req_integration -d -log.console warn - - ./bin/pp_integration -d -log.console warn - - ./bin/scmp_integration -d -log.console warn - - ./bin/end2end_integration -d -log.console warn - plugins: - - scionproto/metahook#v0.3.0: - post-command: | - echo "--- Shutting down SCION topology" - ./scion.sh stop - echo "SCION topology successfully shut down" - artifact_paths: - - "artifacts.out/**/*" - timeout_in_minutes: 10 - key: docker_integration_tests - retry: - automatic: - - exit_status: -1 # Agent was lost - - exit_status: 255 # Forced agent shutdown - label: ":docker: Integration: end2end_integration full topology" if: build.message !~ /\[doc\]/ command: @@ -174,6 +77,7 @@ steps: echo "--- Shutting down SCION topology" ./scion.sh stop echo "SCION topology successfully shut down" + parallelism: 10 artifact_paths: - "artifacts.out/**/*" timeout_in_minutes: 10 From a6b74d5154722e3637be235448f1264b0a070f21 Mon Sep 17 00:00:00 2001 From: roos Date: Thu, 7 May 2020 10:16:00 +0200 Subject: [PATCH 02/10] drop acceptance --- .buildkite/pipeline.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/.buildkite/pipeline.sh b/.buildkite/pipeline.sh index 058729ec4d..7ec7c127f0 100755 --- a/.buildkite/pipeline.sh +++ b/.buildkite/pipeline.sh @@ -5,5 +5,3 @@ set -eou pipefail . ./.buildkite/pipeline_lib.sh cat .buildkite/pipeline.yml -gen_bazel_test_steps //acceptance -gen_acceptance ./acceptance From 656bfe8f266a1e820a081087dfd02c608101105b Mon Sep 17 00:00:00 2001 From: roos Date: Thu, 7 May 2020 14:14:40 +0200 Subject: [PATCH 03/10] update quic-go --- go.mod | 2 +- go.sum | 2 ++ go_deps.bzl | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 30965ea4c8..3ec17d09a1 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,7 @@ require ( github.com/iancoleman/strcase v0.0.0-20190422225806-e506e3ef7365 github.com/inconshreveable/log15 v0.0.0-20161013181240-944cbfb97b44 github.com/kormat/fmt15 v0.0.0-20181112140556-ee69fecb2656 - github.com/lucas-clemente/quic-go v0.15.5 + github.com/lucas-clemente/quic-go v0.15.7 github.com/mattn/go-colorable v0.1.4 // indirect github.com/mattn/go-isatty v0.0.8 github.com/mattn/go-sqlite3 v1.9.1-0.20180719091609-b3511bfdd742 diff --git a/go.sum b/go.sum index 8e97b4beef..c25dbdf432 100644 --- a/go.sum +++ b/go.sum @@ -144,6 +144,8 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/lucas-clemente/quic-go v0.15.5 h1:2DR5qCVt4k1qnDmknf+idj1FDJIrkiDyS6a0uIR+UJY= github.com/lucas-clemente/quic-go v0.15.5/go.mod h1:Myi1OyS0FOjL3not4BxT7KN29bRkcMUV5JVVFLKtDp8= +github.com/lucas-clemente/quic-go v0.15.7 h1:Pu7To5/G9JoP1mwlrcIvfV8ByPBlCzif3MCl8+1W83I= +github.com/lucas-clemente/quic-go v0.15.7/go.mod h1:Myi1OyS0FOjL3not4BxT7KN29bRkcMUV5JVVFLKtDp8= github.com/lunixbochs/vtclean v1.0.0/go.mod h1:pHhQNgMf3btfWnGBVipUOjRYhoOsdGqdm/+2c2E2WMI= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/mailru/easyjson v0.0.0-20190312143242-1de009706dbe/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= diff --git a/go_deps.bzl b/go_deps.bzl index 2737bdf815..dac8b27bcf 100644 --- a/go_deps.bzl +++ b/go_deps.bzl @@ -479,8 +479,8 @@ def go_deps(): go_repository( name = "com_github_lucas_clemente_quic_go", importpath = "github.com/lucas-clemente/quic-go", - sum = "h1:2DR5qCVt4k1qnDmknf+idj1FDJIrkiDyS6a0uIR+UJY=", - version = "v0.15.5", + sum = "h1:Pu7To5/G9JoP1mwlrcIvfV8ByPBlCzif3MCl8+1W83I=", + version = "v0.15.7", ) go_repository( name = "com_github_lunixbochs_vtclean", From b776a1db73d309f052bf823536038272229bcf10 Mon Sep 17 00:00:00 2001 From: roos Date: Thu, 7 May 2020 15:28:34 +0200 Subject: [PATCH 04/10] enable logging --- python/topology/supervisor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/topology/supervisor.py b/python/topology/supervisor.py index 42135eeb08..48f546a759 100755 --- a/python/topology/supervisor.py +++ b/python/topology/supervisor.py @@ -121,7 +121,7 @@ def _common_entry(self, name, cmd_args, elem_dir=None): entry = { 'autostart': 'false', 'autorestart': 'false', - 'environment': 'TZ=UTC', + 'environment': 'TZ=UTC,QUIC_GO_LOG_LEVEL=debug', 'stdout_logfile': "NONE", 'stderr_logfile': "NONE", 'startretries': 0, From 1743b04ef4074bfe7921dae5267dfc610e5fda3d Mon Sep 17 00:00:00 2001 From: roos Date: Thu, 7 May 2020 15:56:44 +0200 Subject: [PATCH 05/10] fast accept --- go/lib/infra/messenger/tcp/messenger.go | 14 +++++++------- go/lib/infra/rpc/rpc.go | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/go/lib/infra/messenger/tcp/messenger.go b/go/lib/infra/messenger/tcp/messenger.go index 950038e0ae..ec7881daac 100644 --- a/go/lib/infra/messenger/tcp/messenger.go +++ b/go/lib/infra/messenger/tcp/messenger.go @@ -203,9 +203,12 @@ func (m *Messenger) ListenAndServe() { log.Error("[tcp-msgr] Listen error", "err", err) return } - if err := m.handleConn(conn); err != nil { - log.Warn("[tcp-msgr] Server handler exited with error", "err", err) - } + go func() { + defer log.HandlePanic() + if err := m.handleConn(conn); err != nil { + log.Warn("[tcp-msgr] Server handler exited with error", "err", err) + } + }() } } @@ -223,10 +226,7 @@ func (m *Messenger) handleConn(conn net.Conn) error { Message: msg, Address: conn.RemoteAddr(), } - go func() { - defer log.HandlePanic() - m.Handler.ServeRPC(rw, request) - }() + m.Handler.ServeRPC(rw, request) return nil } diff --git a/go/lib/infra/rpc/rpc.go b/go/lib/infra/rpc/rpc.go index 37d9637358..632a0b4cb7 100644 --- a/go/lib/infra/rpc/rpc.go +++ b/go/lib/infra/rpc/rpc.go @@ -73,9 +73,12 @@ func (s *Server) ListenAndServe() error { log.Warn("[quic] server accept error", "err", err) continue } - if err := s.handleQUICSession(session); err != nil { - log.Warn("[quic] server handler exited with error", "err", err) - } + go func() { + defer log.HandlePanic() + if err := s.handleQUICSession(session); err != nil { + log.Warn("[quic] server handler exited with error", "err", err) + } + }() } } @@ -120,10 +123,7 @@ func (s *Server) handleQUICSession(session quic.Session) error { Message: msg, Address: session.RemoteAddr(), } - go func() { - defer log.HandlePanic() - s.Handler.ServeRPC(rw, request) - }() + s.Handler.ServeRPC(rw, request) return nil } From 2bcb352428ddd6b2eeb06423b374cda7c8170302 Mon Sep 17 00:00:00 2001 From: roos Date: Thu, 7 May 2020 17:20:59 +0200 Subject: [PATCH 06/10] exponential backoff --- go/lib/infra/rpc/rpc.go | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/go/lib/infra/rpc/rpc.go b/go/lib/infra/rpc/rpc.go index 632a0b4cb7..0f3eb62ea9 100644 --- a/go/lib/infra/rpc/rpc.go +++ b/go/lib/infra/rpc/rpc.go @@ -23,6 +23,7 @@ import ( "net" "strings" "sync" + "time" quic "github.com/lucas-clemente/quic-go" capnp "zombiezen.com/go/capnproto2" @@ -143,10 +144,19 @@ type Client struct { func (c *Client) Request(ctx context.Context, request *Request, address net.Addr) (*Reply, error) { addressStr := computeAddressStr(address) - session, err := quic.DialContext(ctx, c.Conn, address, addressStr, - c.TLSConfig, c.QUICConfig) - if err != nil { - return nil, err + var session quic.Session + for sleep := 2 * time.Millisecond; ctx.Err() == nil; sleep = sleep * 2 { + var err error + session, err = quic.DialContext(ctx, c.Conn, address, addressStr, c.TLSConfig, c.QUICConfig) + if err == nil { + break + } + log.FromCtx(ctx).Info("Received error", "err", err) + if err.Error() != "SERVER_BUSY" { + return nil, err + } + log.FromCtx(ctx).Info("Exponential back-off", "sleep", sleep) + time.Sleep(sleep) } stream, err := session.OpenStream() From 190cd5e78b002be830ca3458f723f1eafe0b63ac Mon Sep 17 00:00:00 2001 From: roos Date: Fri, 8 May 2020 13:59:17 +0200 Subject: [PATCH 07/10] respect context --- go/lib/infra/rpc/rpc.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/go/lib/infra/rpc/rpc.go b/go/lib/infra/rpc/rpc.go index 0f3eb62ea9..60f6729e27 100644 --- a/go/lib/infra/rpc/rpc.go +++ b/go/lib/infra/rpc/rpc.go @@ -151,12 +151,13 @@ func (c *Client) Request(ctx context.Context, request *Request, address net.Addr if err == nil { break } - log.FromCtx(ctx).Info("Received error", "err", err) if err.Error() != "SERVER_BUSY" { return nil, err } - log.FromCtx(ctx).Info("Exponential back-off", "sleep", sleep) - time.Sleep(sleep) + select { + case <-time.After(sleep): + case <-ctx.Done(): + } } stream, err := session.OpenStream() From ac2d78de790efe533d7e2a05e34fe65cd5a4cc12 Mon Sep 17 00:00:00 2001 From: roos Date: Fri, 8 May 2020 14:18:18 +0200 Subject: [PATCH 08/10] log --- go/lib/infra/rpc/rpc.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/go/lib/infra/rpc/rpc.go b/go/lib/infra/rpc/rpc.go index 60f6729e27..0bbf584907 100644 --- a/go/lib/infra/rpc/rpc.go +++ b/go/lib/infra/rpc/rpc.go @@ -154,10 +154,12 @@ func (c *Client) Request(ctx context.Context, request *Request, address net.Addr if err.Error() != "SERVER_BUSY" { return nil, err } + log.FromCtx(ctx).Info("SERVER BUSY") select { case <-time.After(sleep): case <-ctx.Done(): } + log.FromCtx(ctx).Info("SERVER BUSY: Done sleep") } stream, err := session.OpenStream() From ea86378eedb1c89a9822a8c19af254ecedcbef4b Mon Sep 17 00:00:00 2001 From: roos Date: Fri, 8 May 2020 14:36:40 +0200 Subject: [PATCH 09/10] jitter --- go/lib/infra/rpc/rpc.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/go/lib/infra/rpc/rpc.go b/go/lib/infra/rpc/rpc.go index 0bbf584907..2507c91be9 100644 --- a/go/lib/infra/rpc/rpc.go +++ b/go/lib/infra/rpc/rpc.go @@ -20,6 +20,7 @@ import ( "crypto/tls" "fmt" "io" + mrand "math/rand" "net" "strings" "sync" @@ -154,12 +155,10 @@ func (c *Client) Request(ctx context.Context, request *Request, address net.Addr if err.Error() != "SERVER_BUSY" { return nil, err } - log.FromCtx(ctx).Info("SERVER BUSY") select { - case <-time.After(sleep): + case <-time.After(sleep + time.Duration(mrand.Int()%5)*time.Millisecond): case <-ctx.Done(): } - log.FromCtx(ctx).Info("SERVER BUSY: Done sleep") } stream, err := session.OpenStream() From cb2174a4df7ca0068a623a75d689411570e8606a Mon Sep 17 00:00:00 2001 From: roos Date: Fri, 8 May 2020 14:54:37 +0200 Subject: [PATCH 10/10] more granular --- go/lib/infra/rpc/rpc.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/lib/infra/rpc/rpc.go b/go/lib/infra/rpc/rpc.go index 2507c91be9..70d3c04a50 100644 --- a/go/lib/infra/rpc/rpc.go +++ b/go/lib/infra/rpc/rpc.go @@ -156,7 +156,7 @@ func (c *Client) Request(ctx context.Context, request *Request, address net.Addr return nil, err } select { - case <-time.After(sleep + time.Duration(mrand.Int()%5)*time.Millisecond): + case <-time.After(sleep + time.Duration(mrand.Int()%5000)*time.Microsecond): case <-ctx.Done(): } }