diff --git a/docs/environment-variables.md b/docs/environment-variables.md index e01d848a8879..efe9a030675e 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -8,50 +8,51 @@ most users. Environment variables may be removed at any time. ## Controller -| Name | Type | Default | Description | -|----------------------------------------|---------------------|---------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `ARGO_AGENT_TASK_WORKERS` | `int` | `16` | The number of task workers for the agent pod. | -| `ALL_POD_CHANGES_SIGNIFICANT` | `bool` | `false` | Whether to consider all pod changes as significant during pod reconciliation. | -| `ALWAYS_OFFLOAD_NODE_STATUS` | `bool` | `false` | Whether to always offload the node status. | -| `ARCHIVED_WORKFLOW_GC_PERIOD` | `time.Duration` | `24h` | The periodicity for GC of archived workflows. | -| `ARGO_PPROF` | `bool` | `false` | Enable `pprof` endpoints | -| `ARGO_PROGRESS_PATCH_TICK_DURATION` | `time.Duration` | `1m` | How often self reported progress is patched into the pod annotations which means how long it takes until the controller picks up the progress change. Set to 0 to disable self reporting progress. | -| `ARGO_PROGRESS_FILE_TICK_DURATION` | `time.Duration` | `3s` | How often the progress file is read by the executor. Set to 0 to disable self reporting progress. | -| `ARGO_REMOVE_PVC_PROTECTION_FINALIZER` | `bool` | `true` | Remove the `kubernetes.io/pvc-protection` finalizer from persistent volume claims (PVC) after marking PVCs created for the workflow for deletion, so deleted is not blocked until the pods are deleted. [#6629](https://github.com/argoproj/argo-workflows/issues/6629) | -| `ARGO_TRACE` | `string` | `` | Whether to enable tracing statements in Argo components. | -| `ARGO_AGENT_PATCH_RATE` | `time.Duration` | `DEFAULT_REQUEUE_TIME` | Rate that the Argo Agent will patch the workflow task-set. | -| `ARGO_AGENT_CPU_LIMIT` | `resource.Quantity` | `100m` | CPU resource limit for the agent. | -| `ARGO_AGENT_MEMORY_LIMIT` | `resource.Quantity` | `256m` | Memory resource limit for the agent. | -| `BUBBLE_ENTRY_TEMPLATE_ERR` | `bool` | `true` | Whether to bubble up template errors to workflow. | -| `CACHE_GC_PERIOD` | `time.Duration` | `0s` | How often to perform memoization cache GC, which is disabled by default and can be enabled by providing a non-zero duration. | -| `CACHE_GC_AFTER_NOT_HIT_DURATION` | `time.Duration` | `30s` | When a memoization cache has not been hit after this duration, it will be deleted. | -| `CRON_SYNC_PERIOD` | `time.Duration` | `10s` | How often to sync cron workflows. | -| `DEFAULT_REQUEUE_TIME` | `time.Duration` | `10s` | The re-queue time for the rate limiter of the workflow queue. | -| `DISABLE_MAX_RECURSION` | `bool` | `false` | Set to true to disable the recursion preventer, which will stop a workflow running which has called into a child template 100 times | -| `EXPRESSION_TEMPLATES` | `bool` | `true` | Escape hatch to disable expression templates. | -| `EVENT_AGGREGATION_WITH_ANNOTATIONS` | `bool` | `false` | Whether event annotations will be used when aggregating events. | -| `GRPC_MESSAGE_SIZE` | `string` | Use different GRPC Max message size for Argo server deployment (supporting huge workflows). | -| `GZIP_IMPLEMENTATION` | `string` | `PGZip` | The implementation of compression/decompression. Currently only "`PGZip`" and "`GZip`" are supported. | -| `INFORMER_WRITE_BACK` | `bool` | `true` | Whether to write back to informer instead of catching up. | -| `HEALTHZ_AGE` | `time.Duration` | `5m` | How old a un-reconciled workflow is to report unhealthy. | -| `INDEX_WORKFLOW_SEMAPHORE_KEYS` | `bool` | `true` | Whether or not to index semaphores. | -| `LEADER_ELECTION_IDENTITY` | `string` | Controller's `metadata.name` | The ID used for workflow controllers to elect a leader. | -| `LEADER_ELECTION_DISABLE` | `bool` | `false` | Whether leader election should be disabled. | -| `LEADER_ELECTION_LEASE_DURATION` | `time.Duration` | `15s` | The duration that non-leader candidates will wait to force acquire leadership. | -| `LEADER_ELECTION_RENEW_DEADLINE` | `time.Duration` | `10s` | The duration that the acting master will retry refreshing leadership before giving up. | -| `LEADER_ELECTION_RETRY_PERIOD` | `time.Duration` | `5s` | The duration that the leader election clients should wait between tries of actions. | -| `MAX_OPERATION_TIME` | `time.Duration` | `30s` | The maximum time a workflow operation is allowed to run for before re-queuing the workflow onto the work queue. | -| `OFFLOAD_NODE_STATUS_TTL` | `time.Duration` | `5m` | The TTL to delete the offloaded node status. Currently only used for testing. | -| `POD_NAMES` | `string` | `v2` | Whether to have pod names contain the template name (v2) or be the node id (v1) - should be set the same for Argo Server. | -| `RECENTLY_STARTED_POD_DURATION` | `time.Duration` | `10s` | The duration of a pod before the pod is considered to be recently started. | -| `RETRY_BACKOFF_DURATION` | `time.Duration` | `10ms` | The retry back-off duration when retrying API calls. | -| `RETRY_BACKOFF_FACTOR` | `float` | `2.0` | The retry back-off factor when retrying API calls. | -| `RETRY_BACKOFF_STEPS` | `int` | `5` | The retry back-off steps when retrying API calls. | -| `RETRY_HOST_NAME_LABEL_KEY` | `string` | `kubernetes.io/hostname` | The label key for host name used when retrying templates. | -| `TRANSIENT_ERROR_PATTERN` | `string` | `""` | The regular expression that represents additional patterns for transient errors. | -| `WF_DEL_PROPAGATION_POLICY` | `string` | `""` | The deletion propagation policy for workflows. | -| `WORKFLOW_GC_PERIOD` | `time.Duration` | `5m` | The periodicity for GC of workflows. | -| `SEMAPHORE_NOTIFY_DELAY` | `time.Duration` | `1s` | Tuning Delay when notifying semaphore waiters about availability in the semaphore | +| Name | Type | Default | Description | +|------------------------------------------|---------------------|---------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `ARGO_AGENT_TASK_WORKERS` | `int` | `16` | The number of task workers for the agent pod. | +| `ALL_POD_CHANGES_SIGNIFICANT` | `bool` | `false` | Whether to consider all pod changes as significant during pod reconciliation. | +| `ALWAYS_OFFLOAD_NODE_STATUS` | `bool` | `false` | Whether to always offload the node status. | +| `ARCHIVED_WORKFLOW_GC_PERIOD` | `time.Duration` | `24h` | The periodicity for GC of archived workflows. | +| `ARGO_PPROF` | `bool` | `false` | Enable `pprof` endpoints | +| `ARGO_PROGRESS_PATCH_TICK_DURATION` | `time.Duration` | `1m` | How often self reported progress is patched into the pod annotations which means how long it takes until the controller picks up the progress change. Set to 0 to disable self reporting progress. | +| `ARGO_PROGRESS_FILE_TICK_DURATION` | `time.Duration` | `3s` | How often the progress file is read by the executor. Set to 0 to disable self reporting progress. | +| `ARGO_REMOVE_PVC_PROTECTION_FINALIZER` | `bool` | `true` | Remove the `kubernetes.io/pvc-protection` finalizer from persistent volume claims (PVC) after marking PVCs created for the workflow for deletion, so deleted is not blocked until the pods are deleted. [#6629](https://github.com/argoproj/argo-workflows/issues/6629) | +| `ARGO_TRACE` | `string` | `` | Whether to enable tracing statements in Argo components. | +| `ARGO_AGENT_PATCH_RATE` | `time.Duration` | `DEFAULT_REQUEUE_TIME` | Rate that the Argo Agent will patch the workflow task-set. | +| `ARGO_AGENT_CPU_LIMIT` | `resource.Quantity` | `100m` | CPU resource limit for the agent. | +| `ARGO_AGENT_MEMORY_LIMIT` | `resource.Quantity` | `256m` | Memory resource limit for the agent. | +| `BUBBLE_ENTRY_TEMPLATE_ERR` | `bool` | `true` | Whether to bubble up template errors to workflow. | +| `CACHE_GC_PERIOD` | `time.Duration` | `0s` | How often to perform memoization cache GC, which is disabled by default and can be enabled by providing a non-zero duration. | +| `CACHE_GC_AFTER_NOT_HIT_DURATION` | `time.Duration` | `30s` | When a memoization cache has not been hit after this duration, it will be deleted. | +| `CRON_SYNC_PERIOD` | `time.Duration` | `10s` | How often to sync cron workflows. | +| `DEFAULT_REQUEUE_TIME` | `time.Duration` | `10s` | The re-queue time for the rate limiter of the workflow queue. | +| `DISABLE_MAX_RECURSION` | `bool` | `false` | Set to true to disable the recursion preventer, which will stop a workflow running which has called into a child template 100 times | +| `EXPRESSION_TEMPLATES` | `bool` | `true` | Escape hatch to disable expression templates. | +| `EVENT_AGGREGATION_WITH_ANNOTATIONS` | `bool` | `false` | Whether event annotations will be used when aggregating events. | +| `GRPC_MESSAGE_SIZE` | `string` | Use different GRPC Max message size for Argo server deployment (supporting huge workflows). | +| `GZIP_IMPLEMENTATION` | `string` | `PGZip` | The implementation of compression/decompression. Currently only "`PGZip`" and "`GZip`" are supported. | +| `INFORMER_WRITE_BACK` | `bool` | `true` | Whether to write back to informer instead of catching up. | +| `HEALTHZ_AGE` | `time.Duration` | `5m` | How old a un-reconciled workflow is to report unhealthy. | +| `INDEX_WORKFLOW_SEMAPHORE_KEYS` | `bool` | `true` | Whether or not to index semaphores. | +| `LEADER_ELECTION_IDENTITY` | `string` | Controller's `metadata.name` | The ID used for workflow controllers to elect a leader. | +| `LEADER_ELECTION_DISABLE` | `bool` | `false` | Whether leader election should be disabled. | +| `LEADER_ELECTION_LEASE_DURATION` | `time.Duration` | `15s` | The duration that non-leader candidates will wait to force acquire leadership. | +| `LEADER_ELECTION_RENEW_DEADLINE` | `time.Duration` | `10s` | The duration that the acting master will retry refreshing leadership before giving up. | +| `LEADER_ELECTION_RETRY_PERIOD` | `time.Duration` | `5s` | The duration that the leader election clients should wait between tries of actions. | +| `MAX_OPERATION_TIME` | `time.Duration` | `30s` | The maximum time a workflow operation is allowed to run for before re-queuing the workflow onto the work queue. | +| `OFFLOAD_NODE_STATUS_TTL` | `time.Duration` | `5m` | The TTL to delete the offloaded node status. Currently only used for testing. | +| `OPERATION_DURATION_METRIC_BUCKET_COUNT` | `int` | `6` | The number of buckets to collect the metric for the operation duration. | +| `POD_NAMES` | `string` | `v2` | Whether to have pod names contain the template name (v2) or be the node id (v1) - should be set the same for Argo Server. | +| `RECENTLY_STARTED_POD_DURATION` | `time.Duration` | `10s` | The duration of a pod before the pod is considered to be recently started. | +| `RETRY_BACKOFF_DURATION` | `time.Duration` | `10ms` | The retry back-off duration when retrying API calls. | +| `RETRY_BACKOFF_FACTOR` | `float` | `2.0` | The retry back-off factor when retrying API calls. | +| `RETRY_BACKOFF_STEPS` | `int` | `5` | The retry back-off steps when retrying API calls. | +| `RETRY_HOST_NAME_LABEL_KEY` | `string` | `kubernetes.io/hostname` | The label key for host name used when retrying templates. | +| `TRANSIENT_ERROR_PATTERN` | `string` | `""` | The regular expression that represents additional patterns for transient errors. | +| `WF_DEL_PROPAGATION_POLICY` | `string` | `""` | The deletion propagation policy for workflows. | +| `WORKFLOW_GC_PERIOD` | `time.Duration` | `5m` | The periodicity for GC of workflows. | +| `SEMAPHORE_NOTIFY_DELAY` | `time.Duration` | `1s` | Tuning Delay when notifying semaphore waiters about availability in the semaphore | CLI parameters of the `argo-server` and `workflow-controller` can be specified as environment variables with the `ARGO_` prefix. For example: diff --git a/workflow/metrics/metrics.go b/workflow/metrics/metrics.go index dbd9bd105599..a61516b99a32 100644 --- a/workflow/metrics/metrics.go +++ b/workflow/metrics/metrics.go @@ -11,6 +11,7 @@ import ( "k8s.io/client-go/util/workqueue" "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1" + envutil "github.com/argoproj/argo-workflows/v3/util/env" ) const ( @@ -20,6 +21,11 @@ const ( DefaultMetricsServerPath = "/metrics" ) +var ( + maxOperationTimeSeconds = envutil.LookupEnvDurationOr("MAX_OPERATION_TIME", 30*time.Second).Seconds() + operationDurationMetricBucketCount = envutil.LookupEnvIntOr("OPERATION_DURATION_METRIC_BUCKET_COUNT", 6) +) + type ServerConfig struct { Enabled bool Path string @@ -72,6 +78,7 @@ func (m *Metrics) Fire(entry *log.Entry) error { var _ prometheus.Collector = &Metrics{} func New(metricsConfig, telemetryConfig ServerConfig) *Metrics { + bucketWidth := maxOperationTimeSeconds / float64(operationDurationMetricBucketCount) metrics := &Metrics{ metricsConfig: metricsConfig, telemetryConfig: telemetryConfig, @@ -79,7 +86,11 @@ func New(metricsConfig, telemetryConfig ServerConfig) *Metrics { podsByPhase: getPodPhaseGauges(), workflowsByPhase: getWorkflowPhaseGauges(), workflows: make(map[string][]string), - operationDurations: newHistogram("operation_duration_seconds", "Histogram of durations of operations", nil, []float64{0.1, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 2.5, 3.0}), + operationDurations: newHistogram("operation_duration_seconds", + "Histogram of durations of operations", + nil, + // We start the bucket with `bucketWidth` since lowest bucket has an upper bound of 'start'. + prometheus.LinearBuckets(bucketWidth, bucketWidth, operationDurationMetricBucketCount)), errors: getErrorCounters(), customMetrics: make(map[string]metric), workqueueMetrics: make(map[string]prometheus.Metric), diff --git a/workflow/metrics/metrics_test.go b/workflow/metrics/metrics_test.go index ff03b70f5340..226efe293bd0 100644 --- a/workflow/metrics/metrics_test.go +++ b/workflow/metrics/metrics_test.go @@ -62,8 +62,9 @@ func TestMetrics(t *testing.T) { } m := New(config, config) - m.OperationCompleted(0.05) - assert.Equal(t, uint64(1), *write(m.operationDurations).Histogram.Bucket[0].CumulativeCount) + // Default buckets: {5, 10, 15, 20, 25, 30} + m.OperationCompleted(5) + assert.Equal(t, 1, int(*write(m.operationDurations).Histogram.Bucket[1].CumulativeCount)) assert.Nil(t, m.GetCustomMetric("does-not-exist"))