diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 8e1cd40bd4..e64966a1e7 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -228,7 +228,7 @@ jobs:
       - name: Build test dependencies
         env:
           DOCKER_BUILDX_OUTPUT: type=docker
-        run: make docker-build-osm build-osm docker-build-tcp-echo-server docker-build-retry
+        run: make docker-build-osm build-osm docker-build-tcp-echo-server
       # PR Tests
       - name: Run PR tests
         id: pr_test
diff --git a/Makefile b/Makefile
index 92ee2a74e3..242b281938 100644
--- a/Makefile
+++ b/Makefile
@@ -128,7 +128,7 @@ kind-reset:
 
 .PHONY: test-e2e
 test-e2e: DOCKER_BUILDX_OUTPUT=type=docker
-test-e2e: docker-build-osm build-osm docker-build-tcp-echo-server docker-build-retry
+test-e2e: docker-build-osm build-osm docker-build-tcp-echo-server
 	go test ./tests/e2e $(E2E_FLAGS_DEFAULT) $(E2E_FLAGS)
 
 .env:
@@ -143,7 +143,7 @@ kind-demo: .env kind-up clean-osm
 build-bookwatcher:
 	GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o ./demo/bin/bookwatcher/bookwatcher ./demo/cmd/bookwatcher
 
-DEMO_TARGETS = bookbuyer bookthief bookstore bookwarehouse tcp-echo-server tcp-client retry
+DEMO_TARGETS = bookbuyer bookthief bookstore bookwarehouse tcp-echo-server tcp-client
 # docker-build-bookbuyer, etc
 DOCKER_DEMO_TARGETS = $(addprefix docker-build-, $(DEMO_TARGETS))
 .PHONY: $(DOCKER_DEMO_TARGETS)
diff --git a/demo/cmd/retry/retry.go b/demo/cmd/retry/retry.go
deleted file mode 100644
index 2f1474b8e2..0000000000
--- a/demo/cmd/retry/retry.go
+++ /dev/null
@@ -1,40 +0,0 @@
-package main
-
-import (
-	_ "embed"
-	"fmt"
-	"net/http"
-	"sync"
-
-	"github.com/openservicemesh/osm/pkg/logger"
-	"github.com/openservicemesh/osm/tests/e2e"
-)
-
-var log = logger.NewPretty("retry")
-
-var mu sync.Mutex
-var httpRequests uint32
-
-const retryOn = 555
-
-func retryHandler(w http.ResponseWriter, r *http.Request) {
-	mu.Lock()
-	// Count number of http requests received
-	httpRequests++
-	mu.Unlock()
-
-	// Return status code that causes retry policy
-	if httpRequests <= e2e.NumRetries {
-		w.WriteHeader(retryOn)
-	}
-	_, err := w.Write([]byte(fmt.Sprintf("RequestCount:%v\n", httpRequests)))
-	if err != nil {
-		log.Error().Err(err).Msgf("Couldn't write number of requests recevied")
-	}
-}
-
-func main() {
-	http.HandleFunc("/", retryHandler)
-	err := http.ListenAndServe(":9091", nil)
-	log.Fatal().Err(err).Msgf("Failed to start HTTP server on port 9091")
-}
diff --git a/tests/e2e/constants.go b/tests/e2e/constants.go
index 2bbdb18cea..862b2db471 100644
--- a/tests/e2e/constants.go
+++ b/tests/e2e/constants.go
@@ -14,6 +14,4 @@ const (
 
 var (
 	fortioSingleCallSpec = framework.FortioLoadTestSpec{Calls: 1}
-	// NumRetries is the number of retries for retry e2e
-	NumRetries uint32 = 5
 )
diff --git a/tests/e2e/e2e_retry_policy_test.go b/tests/e2e/e2e_retry_policy_test.go
index 3d3376d0c9..8b3d421c22 100644
--- a/tests/e2e/e2e_retry_policy_test.go
+++ b/tests/e2e/e2e_retry_policy_test.go
@@ -1,8 +1,12 @@
 package e2e
 
 import (
+	"bufio"
 	"context"
 	"fmt"
+	"path/filepath"
+	"regexp"
+	"strings"
 	"time"
 
 	. "github.com/onsi/ginkgo"
"github.com/onsi/ginkgo" @@ -18,6 +22,9 @@ const server = "server" var meshNs = []string{client, server} +var retryStats = map[string]string{"upstream_rq_retry": "", "upstream_rq_retry_limit_exceeded": "", "upstream_rq_retry_backoff_exponential": ""} +var thresholdUintVal uint32 = 5 + var _ = OSMDescribe("Test Retry Policy", OSMDescribeInfo{ Tier: 2, @@ -25,7 +32,7 @@ var _ = OSMDescribe("Test Retry Policy", }, func() { Context("Retry policy enabled", func() { - It("tests retryOn and numRetries field for retry policy", + It("tests retry policy", func() { // Install OSM installOpts := Td.GetOSMInstallOpts() @@ -39,19 +46,14 @@ var _ = OSMDescribe("Test Retry Policy", Expect(Td.AddNsToMesh(true, n)).To(Succeed()) } - // Load retry image - retryImage := fmt.Sprintf("%s/retry:%s", installOpts.ContainerRegistryLoc, installOpts.OsmImagetag) - Expect(Td.LoadImagesToKind([]string{"retry"})).To(Succeed()) - + // Get simple pod definitions for the HTTP server svcAccDef, podDef, svcDef, err := Td.SimplePodApp( SimplePodAppDef{ - PodName: server, - Namespace: server, - ServiceAccountName: server, - Command: []string{"/retry"}, - Image: retryImage, - Ports: []int{9091}, - OS: Td.ClusterOS, + PodName: server, + Namespace: server, + Image: "kennethreitz/httpbin", + Ports: []int{80}, + OS: Td.ClusterOS, }) Expect(err).NotTo(HaveOccurred()) @@ -105,7 +107,7 @@ var _ = OSMDescribe("Test Retry Policy", RetryPolicy: v1alpha1.RetryPolicySpec{ RetryOn: "5xx", PerTryTimeout: &metav1.Duration{Duration: time.Duration(1 * time.Second)}, - NumRetries: &NumRetries, + NumRetries: &thresholdUintVal, RetryBackoffBaseInterval: &metav1.Duration{Duration: time.Duration(5 * time.Second)}, }, }, @@ -117,22 +119,30 @@ var _ = OSMDescribe("Test Retry Policy", SourceNs: client, SourcePod: clientPod.Name, SourceContainer: podDef.GetName(), - Destination: fmt.Sprintf("%s.%s.svc.cluster.local:9091", serverSvc.Name, server), + Destination: fmt.Sprintf("%s.%s.svc.cluster.local:80/status/503", serverSvc.Name, server), } By("A request that will be retried NumRetries times then succeed") - // wait for server time.Sleep(3 * time.Second) - result := Td.RetryHTTPRequest(req) - // One count is the initial http request that returns a retriable status code - // followed by numRetries retries - Expect(result.RequestCount).To(Equal(int(NumRetries) + 1)) - Expect(result.StatusCode).To(Equal(200)) - Expect(result.Err).To(BeNil()) + result := Td.HTTPRequest(req) + stdout, _, err := Td.RunLocal(filepath.FromSlash("../../bin/osm"), "proxy", "get", "stats", clientPod.Name, "--namespace", client) + Expect(err).ToNot((HaveOccurred())) + + metrics, err := findRetryStats(stdout.String(), serverSvc.Name+"|80", retryStats) + Expect(err).ToNot((HaveOccurred())) + + // upstream_rq_retry: Total request retries + Expect(metrics["upstream_rq_retry"]).To(Equal("5")) + // upstream_rq_retry_limit_exceeded: Total requests not retried because max retries reached + Expect(metrics["upstream_rq_retry_limit_exceeded"]).To(Equal("1")) + // upstream_rq_retry_backoff_exponential: Total retries using the exponential backoff strategy + Expect(metrics["upstream_rq_retry_backoff_exponential"]).To(Equal("5")) + + Expect(result.StatusCode).To(Equal(503)) }) }) Context("Retry policy disabled", func() { - It("tests retry does not occur", + It("tests retry policy", func() { // Install OSM installOpts := Td.GetOSMInstallOpts() @@ -145,19 +155,15 @@ var _ = OSMDescribe("Test Retry Policy", Expect(Td.CreateNs(n, nil)).To(Succeed()) Expect(Td.AddNsToMesh(true, n)).To(Succeed()) } - // 
-				retryImage := fmt.Sprintf("%s/retry:%s", installOpts.ContainerRegistryLoc, installOpts.OsmImagetag)
-				Expect(Td.LoadImagesToKind([]string{"retry"})).To(Succeed())
 
+				// Get simple pod definitions for the HTTP server
 				svcAccDef, podDef, svcDef, err := Td.SimplePodApp(
 					SimplePodAppDef{
-						PodName:            server,
-						Namespace:          server,
-						ServiceAccountName: server,
-						Command:            []string{"/retry"},
-						Image:              retryImage,
-						Ports:              []int{9091},
-						OS:                 Td.ClusterOS,
+						PodName:   server,
+						Namespace: server,
+						Image:     "kennethreitz/httpbin",
+						Ports:     []int{80},
+						OS:        Td.ClusterOS,
 					})
 
 				Expect(err).NotTo(HaveOccurred())
@@ -211,7 +217,7 @@ var _ = OSMDescribe("Test Retry Policy",
 						RetryPolicy: v1alpha1.RetryPolicySpec{
 							RetryOn:                  "5xx",
 							PerTryTimeout:            &metav1.Duration{Duration: time.Duration(1 * time.Second)},
-							NumRetries:               &NumRetries,
+							NumRetries:               &thresholdUintVal,
 							RetryBackoffBaseInterval: &metav1.Duration{Duration: time.Duration(5 * time.Second)},
 						},
 					},
@@ -223,17 +229,53 @@ var _ = OSMDescribe("Test Retry Policy",
 					SourceNs:        client,
 					SourcePod:       clientPod.Name,
 					SourceContainer: podDef.GetName(),
-					Destination:     fmt.Sprintf("%s.%s.svc.cluster.local:9091", serverSvc.Name, server),
+					Destination:     fmt.Sprintf("%s.%s.svc.cluster.local:80/status/503", serverSvc.Name, server),
 				}
 
-				By("A request that will not be retried on")
-				// wait for server
+				By("A request that will be retried NumRetries times then succeed")
 				time.Sleep(3 * time.Second)
-				result := Td.RetryHTTPRequest(req)
-				// One count is the initial http request that is not retried on
-				Expect(result.RequestCount).To(Equal(1))
-				Expect(result.StatusCode).To(Equal(555))
-				Expect(result.Err).To(BeNil())
+				result := Td.HTTPRequest(req)
+				stdout, _, err := Td.RunLocal(filepath.FromSlash("../../bin/osm"), "proxy", "get", "stats", clientPod.Name, "--namespace", client)
+				Expect(err).ToNot((HaveOccurred()))
+
+				metrics, err := findRetryStats(stdout.String(), serverSvc.Name+"|80", retryStats)
+				Expect(err).ToNot((HaveOccurred()))
+
+				// upstream_rq_retry: Total request retries
+				Expect(metrics["upstream_rq_retry"]).To(Equal("0"))
+				// upstream_rq_retry_limit_exceeded: Total requests not retried because max retries reached
+				Expect(metrics["upstream_rq_retry_limit_exceeded"]).To(Equal("0"))
+				// upstream_rq_retry_backoff_exponential: Total retries using the exponential backoff strategy
+				Expect(metrics["upstream_rq_retry_backoff_exponential"]).To(Equal("0"))
+
+				Expect(result.StatusCode).To(Equal(503))
 			})
 		})
+
 	})
+
+func findRetryStats(output, serverSvc string, retryStats map[string]string) (map[string]string, error) {
+	scanner := bufio.NewScanner(strings.NewReader(output))
+	for scanner.Scan() {
+		stat := scanner.Text()
+		if strings.Contains(stat, serverSvc) {
+			retryStats = getMetric(stat, retryStats)
+		}
+	}
+
+	err := scanner.Err()
+	return retryStats, err
+}
+
+func getMetric(stat string, retryStats map[string]string) map[string]string {
+	for r := range retryStats {
+		regR := r + "\\b"
+		match, _ := regexp.MatchString(regR, stat)
+		if match {
+			splitStat := strings.Split(stat, ":")
+			res := strings.ReplaceAll(splitStat[1], " ", "")
+			retryStats[r] = res
+		}
+	}
+	return retryStats
+}
diff --git a/tests/framework/common_traffic.go b/tests/framework/common_traffic.go
index 2b43644017..976dc3446b 100644
--- a/tests/framework/common_traffic.go
+++ b/tests/framework/common_traffic.go
@@ -74,13 +74,6 @@ type HTTPRequestResult struct {
 	Err        error
 }
 
-// RetryRequestResult represents the result of a HTTPRequest call for retry
-type RetryRequestResult struct {
-	RequestCount int
-	StatusCode   int
-	Err          error
-}
-
 // TCPRequestResult represents the result of a TCPRequest call
 type TCPRequestResult struct {
 	Response string
@@ -139,60 +132,6 @@ func (td *OsmTestData) HTTPRequest(ht HTTPRequestDef) HTTPRequestResult {
 	}
 }
 
-// RetryHTTPRequest runs a synchronous call to run the HTTPRequestDef and returns a RetryRequestResult
-func (td *OsmTestData) RetryHTTPRequest(ht HTTPRequestDef) RetryRequestResult {
-	// -s silent progress, -L follow redirects
-	var commandStr string
-	if td.ClusterOS == constants.OSWindows {
-		commandStr = fmt.Sprintf("curl.exe -s -w %s:%%{http_code} -L %s", StatusCodeWord, ht.Destination)
-	} else {
-		commandStr = fmt.Sprintf("/usr/bin/curl -s -w %s:%%{http_code} -L %s", StatusCodeWord, ht.Destination)
-	}
-	command := strings.Fields(commandStr)
-
-	stdout, stderr, err := td.RunRemote(ht.SourceNs, ht.SourcePod, ht.SourceContainer, command)
-	if err != nil {
-		// Error codes from the execution come through err
-		// Curl 'Connection refused' err code = 7
-		return RetryRequestResult{
-			0,
-			0,
-			fmt.Errorf("Remote exec err: %v | stderr: %s", err, stderr),
-		}
-	}
-	if len(stderr) > 0 {
-		// no error from execution and proper exit code, we got some stderr though
-		td.T.Logf("[warn] Stderr: %v", stderr)
-	}
-	split := strings.Split(stdout, "\n")
-	var fields [][]string
-	for _, s := range split {
-		fields = append(fields, strings.Split(s, ":"))
-	}
-	rqCount, err := strconv.Atoi(fields[0][1])
-	if err != nil {
-		return RetryRequestResult{
-			0,
-			0,
-			fmt.Errorf("Could not read request count as integer: %v", err),
-		}
-	}
-	statusCode, err := strconv.Atoi(fields[1][1])
-	if err != nil {
-		return RetryRequestResult{
-			0,
-			0,
-			fmt.Errorf("Could not read status code as integer: %v", err),
-		}
-	}
-
-	return RetryRequestResult{
-		rqCount,
-		statusCode,
-		nil,
-	}
-}
-
 // TCPRequest runs a synchronous TCP request to run the TCPRequestDef and return a TCPRequestResult
 func (td *OsmTestData) TCPRequest(req TCPRequestDef) TCPRequestResult {
 	var command []string
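Note (illustrative aside, not part of the patch above): the new `findRetryStats`/`getMetric` helpers assume that `osm proxy get stats` prints Envoy's plain-text stats dump, one `name: value` pair per line, and that the server's cluster name contains `<service>|80`. A minimal, self-contained sketch of that parsing step, using a hypothetical stat line and a hypothetical helper name:

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// getMetricValue mirrors the matching done by getMetric in the patch:
// the stat name must appear as a whole word (so "upstream_rq_retry" does
// not match "upstream_rq_retry_limit_exceeded"), and the value is the
// text after the colon with spaces removed.
func getMetricValue(statLine, statName string) (string, bool) {
	match, err := regexp.MatchString(statName+`\b`, statLine)
	if err != nil || !match {
		return "", false
	}
	parts := strings.SplitN(statLine, ":", 2)
	if len(parts) != 2 {
		return "", false
	}
	return strings.ReplaceAll(parts[1], " ", ""), true
}

func main() {
	// Hypothetical line in the Envoy "name: value" format the helpers expect.
	line := "cluster.server/server|80.upstream_rq_retry: 5"
	if v, ok := getMetricValue(line, "upstream_rq_retry"); ok {
		fmt.Println(v) // "5"
	}
}
```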