diff --git a/Dockerfile b/Dockerfile index 2f430a21be9..f2af984cace 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,7 +34,7 @@ ENV CADENCE_RELEASE_VERSION=$RELEASE_VERSION # bypass codegen, use committed files. must be run separately, before building things. RUN make .fake-codegen -RUN CGO_ENABLED=0 make copyright cadence-cassandra-tool cadence-sql-tool cadence cadence-server +RUN CGO_ENABLED=0 make copyright cadence-cassandra-tool cadence-sql-tool cadence cadence-server cadence-bench cadence-canary # Download dockerize @@ -109,6 +109,21 @@ COPY --from=builder /cadence/cadence /usr/local/bin ENTRYPOINT ["cadence"] +# Cadence Canary +FROM alpine AS cadence-canary + +COPY --from=builder /cadence/cadence-canary /usr/local/bin +COPY --from=builder /cadence/cadence /usr/local/bin + +CMD ["/usr/local/bin/cadence-canary", "--root", "/etc/cadence-canary", "start"] + +# Cadence Bench +FROM alpine AS cadence-bench + +COPY --from=builder /cadence/cadence-bench /usr/local/bin +COPY --from=builder /cadence/cadence /usr/local/bin + +CMD ["/usr/local/bin/cadence-bench", "--root", "/etc/cadence-bench", "start"] # Final image FROM cadence-${TARGET} diff --git a/bench/README.md b/bench/README.md index 537354d2cb9..ee07ef713ea 100644 --- a/bench/README.md +++ b/bench/README.md @@ -6,7 +6,10 @@ This README describes how to set up Cadence bench, different types of bench load Setup ----------- ### Cadence server -Bench tests requires Cadence server with ElasticSearch. You can run it through: + +The basic bench test doesn't require Advanced Visibility. + +Other advanced bench tests require Cadence server with Advanced Visibility. You can run it through: - Docker: Instructions for running Cadence server through docker can be found in `docker/README.md`. Either `docker-compose-es-v7.yml` or `docker-compose-es.yml` can be used to start the server. - Build from source: Please check [CONTRIBUTING](/CONTRIBUTING.md) for how to build and run Cadence server from source.
Please also make sure Kafka and ElasticSearch are running before starting the server with `./cadence-server --zone es start`. If ElasticSearch v7 is used, change the value for `--zone` flag to `es_v7`. @@ -56,6 +59,41 @@ This section briefly describes the purpose of each bench load and provides a sam Please note that all load configurations in `config/bench` is for only local development and illustration purpose, it does not reflect the actual capability of Cadence server. +### Basic +This is the only bench test that doesn't require advanced visibility. + +As the name suggests, this load tests the basic case of starting workflows and running activities in sequential/parallel. Once all test workflows are started, it will wait test workflow timeout + 5 mins before checking the status of all test workflows. If the failure rate is too high, or if there's any open workflows found, the test will fail. + +The basic load can also be run in "panic" mode by setting `"panicStressWorkflow": true,` to test if server can handle large number of panic workflows (which can be caused by a bad worker deployment). + +Sample configuration can be found in `config/bench/basic.json` and `config/bench/basic_panic.json`. To start the test, a sample command can be +``` +cadence --do <domain> wf start --tl cadence-bench-tl-0 --wt basic-load-test-workflow --dt 30 --et 3600 --if config/bench/basic.json +``` + +`<domain>` needs to be one of the domains in bench config (by default ./config/bench/development.yaml), e.g. `cadence-bench`. + +Then wait for the bench test result. +``` +$cadence --do cadence-bench wf ob -w a2813321-a1bd-40c6-934f-88ad0ded6037 +Progress: + 1, 2021-08-20T11:49:14-07:00, WorkflowExecutionStarted + 2, 2021-08-20T11:49:14-07:00, DecisionTaskScheduled +... +...
+ 20, 2021-08-20T11:59:24-07:00, DecisionTaskStarted + 21, 2021-08-20T11:59:24-07:00, DecisionTaskCompleted + 22, 2021-08-20T11:59:24-07:00, WorkflowExecutionCompleted + +Result: + Run Time: 526 seconds + Status: COMPLETED + Output: "SuccessCount: 100, FailedCount: 0" +``` +The test will return an error if the test doesn't pass. There are two cases: +* The stress workflow couldn't finish within the timeout +* There are more failed workflows than expected (configured by `failureThreshold`) + ### Cron `Cron` itself is not a test. It is responsible for running multiple other tests in parallel or sequential according a cron schedule. @@ -74,16 +112,6 @@ A sample cron configuration is in `config/bench/cron.json`, and it can be starte cadence --do <domain> wf start --tl cadence-bench-tl-0 --wt cron-test-workflow --dt 30 --et 7200 --if config/bench/cron.json ``` -### Basic -As the name suggests, this load tests the basic case of starting workflows and running activities in sequential/parallel. Once all test workflows are started, it will wait test workflow timeout + 5 mins before checking the status of all test workflows. If the failure rate is too high, or if there's any open workflows found, the test will fail. - -The basic load can also be run in "panic" mode by setting `"panicStressWorkflow": true,` to test if server can handle large number of panic workflows (which can be caused by a bad worker deployment). - -Sample configuration can be found in `config/bench/basic.json` and `config/ben/basic_panic.json`. To start the test, a sample command can be -``` -cadence --do <domain> wf start --tl cadence-bench-tl-0 --wt basic-load-test-workflow --dt 30 --et 3600 --if config/bench/basic.json -``` - ### Cancellation The load tests the StartWorkflowExecution and CancelWorkflowExecution sync API, and validates the number of cancelled workflows and if there's any open workflow.
diff --git a/bench/load/basic/launchWorkflow.go b/bench/load/basic/launchWorkflow.go index 9597cadb336..31843ad3052 100644 --- a/bench/load/basic/launchWorkflow.go +++ b/bench/load/basic/launchWorkflow.go @@ -239,7 +239,7 @@ func launcherActivity(ctx context.Context, params launcherActivityParams) (activ logger.Debug("Created Workflow", zap.String("WorkflowID", we.ID), zap.String("RunID", we.RunID)) } else { res.IncFailed() - logger.Error("Failed to start workflow execution", zap.String("WorkflowID", we.ID), zap.Error(err)) + logger.Error("Failed to start workflow execution", zap.Error(err)) } activity.RecordHeartbeat(ctx, i) jitter := time.Duration(75 + rand.Intn(25)) @@ -286,6 +286,7 @@ func verifyResultActivity( } // verify the number of failed workflows + moreFailedThanExpected := false query := fmt.Sprintf( failedWorkflowQuery, stressWorkflowName, @@ -297,9 +298,35 @@ func verifyResultActivity( } resp, err := cc.CountWorkflow(ctx, request) if err != nil { + if _, ok := err.(*shared.BadRequestError); ok { + // when cluster doesn't have advanced visibility, use ListCLoseWorkflow API instead + maxPageSize := int32(params.FailedWorkflowCount) + 1 + listWorkflowRequest := &shared.ListClosedWorkflowExecutionsRequest{ + Domain: c.StringPtr(info.WorkflowDomain), + MaximumPageSize: &maxPageSize, + StartTimeFilter: &shared.StartTimeFilter{ + EarliestTime: c.Int64Ptr(params.WorkflowStartTime), + LatestTime: c.Int64Ptr(time.Now().UnixNano()), + }, + TypeFilter: &shared.WorkflowTypeFilter{ + Name: c.StringPtr(stressWorkflowName), + }, + } + closedWorkflow, err := cc.ListClosedWorkflow(ctx, listWorkflowRequest) + if err != nil { + return err + } + if len(closedWorkflow.Executions) > int(params.FailedWorkflowCount) { + moreFailedThanExpected = true + } + return nil + } return err } if resp.GetCount() > params.FailedWorkflowCount { + moreFailedThanExpected = true + } + if moreFailedThanExpected { return cadence.NewCustomError( errReasonValidationFailed, fmt.Sprintf("found too 
many failed workflows(%v) after basic load test completed", params.FailedWorkflowCount), diff --git a/docker/README.md b/docker/README.md index a52118cf44f..0c93fc4ad5e 100644 --- a/docker/README.md +++ b/docker/README.md @@ -47,6 +47,18 @@ docker-compose -f docker-compose-mysql.yml up Also feel free to make your own to combine the above features. +Run canary and bench (load test) +----------------------- +After a local Cadence server has started, use the commands below to run the canary or bench test +``` +docker-compose -f docker-compose-bench.yml up +``` +and +``` +docker-compose -f docker-compose-canary.yml up +``` + + Using a released image ----------------------- The above compose files all using master image. It's taking the latest bits on the master branch of this repo. diff --git a/docker/config/bench/development.yaml b/docker/config/bench/development.yaml new file mode 100644 index 00000000000..2415b2aa262 --- /dev/null +++ b/docker/config/bench/development.yaml @@ -0,0 +1,12 @@ +log: + stdout: true + level: info + +bench: + name: "cadence-bench" + domains: ["cadence-bench", "cadence-bench-sync", "cadence-bench-batch"] + numTaskLists: 2 + +cadence: + service: "cadence-frontend" + host: "host.docker.internal:7933" # see https://docs.docker.com/desktop/mac/networking/ diff --git a/docker/config/canary/development.yaml b/docker/config/canary/development.yaml new file mode 100644 index 00000000000..ed73dafdf71 --- /dev/null +++ b/docker/config/canary/development.yaml @@ -0,0 +1,11 @@ +log: + stdout: true + level: info + +canary: + domains: ["cadence-canary"] + excludes: ["workflow.searchAttributes", "workflow.batch", "workflow.archival.visibility"] + +cadence: + service: "cadence-frontend" + host: "host.docker.internal:7933" # see https://docs.docker.com/desktop/mac/networking/ diff --git a/docker/docker-compose-bench.yml b/docker/docker-compose-bench.yml new file mode 100644 index 00000000000..8e8045edd67 --- /dev/null +++ b/docker/docker-compose-bench.yml @@ -0,0
+1,6 @@ +version: '3' +services: + cadence-bench: + image: ubercadence/cadence-bench:latest + volumes: + - ./config/bench:/etc/cadence-bench/config/bench diff --git a/docker/docker-compose-canary.yml b/docker/docker-compose-canary.yml new file mode 100644 index 00000000000..165d6a2f10e --- /dev/null +++ b/docker/docker-compose-canary.yml @@ -0,0 +1,6 @@ +version: '3' +services: + cadence-canary: + image: ubercadence/cadence-canary:latest + volumes: + - ./config/canary:/etc/cadence-canary/config/canary