From 38c2e565245b32ff34bc5c3f30603c6a7642e27e Mon Sep 17 00:00:00 2001 From: Willem Kaufmann Date: Thu, 18 Jul 2024 15:04:21 -0400 Subject: [PATCH] rpk/cluster: update `rpk self-test` docs Updated with changes made in redpanda-docs here: https://github.com/redpanda-data/docs/pull/599 --- src/go/rpk/pkg/cli/cluster/selftest/start.go | 48 +++++++++---------- src/go/rpk/pkg/cli/cluster/selftest/status.go | 23 +++++++-- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/src/go/rpk/pkg/cli/cluster/selftest/start.go b/src/go/rpk/pkg/cli/cluster/selftest/start.go index 1c64b6891fe68..344aabd041560 100644 --- a/src/go/rpk/pkg/cli/cluster/selftest/start.go +++ b/src/go/rpk/pkg/cli/cluster/selftest/start.go @@ -37,32 +37,32 @@ func newStartCommand(fs afero.Fs, p *config.Params) *cobra.Command { cmd := &cobra.Command{ Use: "start", Short: "Starts a new self-test run", - Long: `Starts one or more benchmark tests on one or more nodes -of the cluster. Available tests to run: + Long: `Starts one or more benchmark tests on one or more nodes of the cluster. -* Disk tests: - * Throughput test: 512 KB messages, sequential read/write - * Uses a larger request message sizes and deeper I/O queue depth to write/read more bytes in a shorter amount of time, at the cost of IOPS/latency. - * Latency test: 4 KB messages, sequential read/write - * Uses smaller request message sizes and lower levels of parallelism to achieve higher IOPS and lower latency. +NOTE: Redpanda self-test runs benchmarks that consume significant system resources. Do not start self-test if large workloads are already running on the system. + +Available tests to run: +* Disk tests: + ** Throughput test: 512 KB messages, sequential read/write + *** Uses larger request message sizes and deeper I/O queue depth to write/read more bytes in a shorter amount of time, at the cost of IOPS/latency.
+ ** Latency test: 4 KB messages, sequential read/write + *** Uses smaller request message sizes and lower levels of parallelism to achieve higher IOPS and lower latency. * Network tests: - * Throughput test: 8192-bit messages - * Unique pairs of Redpanda nodes each act as a client and a server. - * The test pushes as much data over the wire, within the test parameters. - -* Cloud tests: - * Latency test: 1024-bit object. - * Depending on cluster read/write permissions (cloud_storage_enable_remote_read, cloud_storage_enable_remote_write), a series of cloud storage operations are performed: - * 1. Upload an object to an S3 bucket. - * 2. List objects in the bucket. - * 3. Download an object from the bucket. - * 4. Delete the original object from the bucket, if it was uploaded. - - -This command immediately returns on success, and the tests run asynchronously. The -user polls for results with the 'self-test status' -command.`, + ** Throughput test: 8192-bit messages + *** Unique pairs of Redpanda nodes each act as a client and a server. + *** The test pushes as much data as possible over the wire, within the test parameters. +* Cloud storage tests: + ** Latency test: 1024-bit object. + ** Depending on cluster read/write permissions ('cloud_storage_enable_remote_read', 'cloud_storage_enable_remote_write'), a series of cloud storage operations are performed: + *** Upload an object to an object storage. + *** List objects in the object storage. + *** Download an object from the object storage. + *** Delete the original object from the object storage, if it was uploaded. + +This command prompts users for confirmation (unless the flag '--no-confirm' is specified), then returns a test identifier, and runs the tests. + +To view the test status, poll 'rpk cluster self-test status'.
Once the tests end, the cached results will be available with 'rpk cluster self-test status'.`, Args: cobra.ExactArgs(0), Run: func(cmd *cobra.Command, _ []string) { // Load config settings @@ -104,7 +104,7 @@ command.`, cmd.Flags().UintVar(&cloudBackoffMs, "cloud-backoff-ms", 100, "The backoff in milliseconds for a cloud storage request") cmd.Flags().IntSliceVar(&onNodes, "participant-node-ids", nil, - "IDs of nodes that the tests will run on. If not set, tests will run for all node IDs.") + "Comma-separated list of broker IDs that the tests will run on. If not set, tests will run for all node IDs.") cmd.Flags().BoolVar(&onlyDisk, "only-disk-test", false, "Runs only the disk benchmarks") cmd.Flags().BoolVar(&onlyNetwork, "only-network-test", false, "Runs only network benchmarks") cmd.Flags().BoolVar(&onlyCloud, "only-cloud-test", false, "Runs only cloud storage benchmarks") diff --git a/src/go/rpk/pkg/cli/cluster/selftest/status.go b/src/go/rpk/pkg/cli/cluster/selftest/status.go index 9c79a69d08c97..d598c38d23947 100644 --- a/src/go/rpk/pkg/cli/cluster/selftest/status.go +++ b/src/go/rpk/pkg/cli/cluster/selftest/status.go @@ -35,17 +35,32 @@ func newStatusCommand(fs afero.Fs, p *config.Params) *cobra.Command { var format string cmd := &cobra.Command{ Use: "status", - Short: "Queries the status of the currently running or last completed self-test run", - Long: `Returns the status of the currently running or last completed self-test run. + Short: "Returns the status of the currently running tests or the cached results of the last completed run.", + Long: `Returns the status of the currently running tests or the cached results of the last completed run. Use this command after invoking 'self-test start' to determine the status of the jobs launched. Possible results are: * One or more jobs still running - * Returns the IDs of Redpanda nodes still running self-tests. + * Returns the IDs of Redpanda brokers (nodes) still running self-tests.
* No jobs running: - * Returns cached results for all nodes of the last completed test. + * Returns the cached results for all brokers of the last completed test. + +Test results are grouped by broker ID. Each test returns the following: + +* Name: Description of the test. +* Info: Details about the test run attached by Redpanda. +* Type: Either 'disk', 'network', or 'cloud' test. +* Test Id: Unique identifier given to jobs of a run. All IDs in a test should match. If they don't match, then newer and/or older test results have been included erroneously. +* Timeouts: Number of timeouts incurred during the test. +* Start time: Time that the test started, in UTC. +* End time: Time that the test ended, in UTC. +* Avg Duration: Duration of the test. +* IOPS: Number of operations per second. For disk, it's 'seastar::dma_read' and 'seastar::dma_write'. For network, it's 'rpc.send()'. +* Throughput: For disk, throughput rate is in bytes per second. For network, throughput rate is in bits per second. Note that the unit notation is case-sensitive: GiB means gibibytes (disk) and Gib means gibibits (network). +* Latency: 50th, 90th, etc. percentiles of operation latency, reported in microseconds (μs). Represented as P50, P90, P99, P999, and MAX respectively. +If Tiered Storage is not enabled, the cloud storage tests won't run, the warning "Cloud storage is not enabled." will be displayed, and all results will be shown as 0. `, Args: cobra.ExactArgs(0), Run: func(cmd *cobra.Command, _ []string) {