diff --git a/deploy/kubernetes/dolphinscheduler/README.md b/deploy/kubernetes/dolphinscheduler/README.md index b2e9974447f0c..8f2c2d1474529 100644 --- a/deploy/kubernetes/dolphinscheduler/README.md +++ b/deploy/kubernetes/dolphinscheduler/README.md @@ -10,341 +10,347 @@ Please refer to the [Quick Start in Kubernetes](../../../docs/docs/en/guide/inst ## Values -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| alert.affinity | object | `{}` | Affinity is a group of affinity scheduling rules. If specified, the pod's scheduling constraints. More info: [node-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#node-affinity) | -| alert.annotations | object | `{}` | You can use annotations to attach arbitrary non-identifying metadata to objects. Clients such as tools and libraries can retrieve this metadata. | -| alert.enabled | bool | `true` | Enable or disable the Alert-Server component | -| alert.env.JAVA_OPTS | string | `"-Xms512m -Xmx512m -Xmn256m"` | The jvm options for alert server | -| alert.livenessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container liveness. Container will be restarted if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) | -| alert.livenessProbe.enabled | bool | `true` | Turn on and off liveness probe | -| alert.livenessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe | -| alert.livenessProbe.initialDelaySeconds | string | `"30"` | Delay before liveness probe is initiated | -| alert.livenessProbe.periodSeconds | string | `"30"` | How often to perform the probe | -| alert.livenessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe | -| alert.livenessProbe.timeoutSeconds | string | `"5"` | When the probe times out | -| alert.nodeSelector | object | `{}` | NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: [assign-pod-node](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/) | -| alert.persistentVolumeClaim | object | `{"accessModes":["ReadWriteOnce"],"enabled":false,"storage":"20Gi","storageClassName":"-"}` | PersistentVolumeClaim represents a reference to a PersistentVolumeClaim in the same namespace. More info: [persistentvolumeclaims](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#persistentvolumeclaims) | -| alert.persistentVolumeClaim.accessModes | list | `["ReadWriteOnce"]` | `PersistentVolumeClaim` access modes | -| alert.persistentVolumeClaim.enabled | bool | `false` | Set `alert.persistentVolumeClaim.enabled` to `true` to mount a new volume for `alert` | -| alert.persistentVolumeClaim.storage | string | `"20Gi"` | `PersistentVolumeClaim` size | -| alert.persistentVolumeClaim.storageClassName | string | `"-"` | `Alert` logs data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning | -| alert.readinessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. 
More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) | -| alert.readinessProbe.enabled | bool | `true` | Turn on and off readiness probe | -| alert.readinessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe | -| alert.readinessProbe.initialDelaySeconds | string | `"30"` | Delay before readiness probe is initiated | -| alert.readinessProbe.periodSeconds | string | `"30"` | How often to perform the probe | -| alert.readinessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe | -| alert.readinessProbe.timeoutSeconds | string | `"5"` | When the probe times out | -| alert.replicas | int | `1` | Number of desired pods. This is a pointer to distinguish between explicit zero and not specified. Defaults to 1. | -| alert.resources | object | `{}` | Compute Resources required by this container. More info: [manage-resources-containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) | -| alert.service.annotations | object | `{}` | annotations may need to be set when want to scrapy metrics by prometheus but not install prometheus operator | -| alert.service.serviceMonitor | object | `{"annotations":{},"enabled":false,"interval":"15s","labels":{},"path":"/actuator/prometheus"}` | serviceMonitor for prometheus operator | -| alert.service.serviceMonitor.annotations | object | `{}` | serviceMonitor.annotations ServiceMonitor annotations | -| alert.service.serviceMonitor.enabled | bool | `false` | Enable or disable alert-server serviceMonitor | -| alert.service.serviceMonitor.interval | string | `"15s"` | serviceMonitor.interval interval at which metrics should be scraped | -| alert.service.serviceMonitor.labels | object | `{}` | serviceMonitor.labels ServiceMonitor extra labels | -| alert.service.serviceMonitor.path | string | `"/actuator/prometheus"` | serviceMonitor.path path of the metrics endpoint | -| alert.strategy | object | `{"rollingUpdate":{"maxSurge":"25%","maxUnavailable":"25%"},"type":"RollingUpdate"}` | The deployment strategy to use to replace existing pods with new ones. | -| alert.strategy.rollingUpdate.maxSurge | string | `"25%"` | The maximum number of pods that can be scheduled above the desired number of pods | -| alert.strategy.rollingUpdate.maxUnavailable | string | `"25%"` | The maximum number of pods that can be unavailable during the update | -| alert.strategy.type | string | `"RollingUpdate"` | Type of deployment. Can be "Recreate" or "RollingUpdate" | -| alert.tolerations | list | `[]` | Tolerations are appended (excluding duplicates) to pods running with this RuntimeClass during admission, effectively unioning the set of nodes tolerated by the pod and the RuntimeClass. | -| api.affinity | object | `{}` | Affinity is a group of affinity scheduling rules. If specified, the pod's scheduling constraints. More info: [node-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#node-affinity) | -| api.annotations | object | `{}` | You can use annotations to attach arbitrary non-identifying metadata to objects. Clients such as tools and libraries can retrieve this metadata. 
| -| api.enabled | bool | `true` | Enable or disable the API-Server component | -| api.env.JAVA_OPTS | string | `"-Xms512m -Xmx512m -Xmn256m"` | The jvm options for api server | -| api.livenessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container liveness. Container will be restarted if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) | -| api.livenessProbe.enabled | bool | `true` | Turn on and off liveness probe | -| api.livenessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe | -| api.livenessProbe.initialDelaySeconds | string | `"30"` | Delay before liveness probe is initiated | -| api.livenessProbe.periodSeconds | string | `"30"` | How often to perform the probe | -| api.livenessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe | -| api.livenessProbe.timeoutSeconds | string | `"5"` | When the probe times out | -| api.nodeSelector | object | `{}` | NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: [assign-pod-node](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/) | -| api.persistentVolumeClaim | object | `{"accessModes":["ReadWriteOnce"],"enabled":false,"storage":"20Gi","storageClassName":"-"}` | PersistentVolumeClaim represents a reference to a PersistentVolumeClaim in the same namespace. More info: [persistentvolumeclaims](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#persistentvolumeclaims) | -| api.persistentVolumeClaim.accessModes | list | `["ReadWriteOnce"]` | `PersistentVolumeClaim` access modes | -| api.persistentVolumeClaim.enabled | bool | `false` | Set `api.persistentVolumeClaim.enabled` to `true` to mount a new volume for `api` | -| api.persistentVolumeClaim.storage | string | `"20Gi"` | `PersistentVolumeClaim` size | -| api.persistentVolumeClaim.storageClassName | string | `"-"` | `api` logs data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning | -| api.readinessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) | -| api.readinessProbe.enabled | bool | `true` | Turn on and off readiness probe | -| api.readinessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe | -| api.readinessProbe.initialDelaySeconds | string | `"30"` | Delay before readiness probe is initiated | -| api.readinessProbe.periodSeconds | string | `"30"` | How often to perform the probe | -| api.readinessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe | -| api.readinessProbe.timeoutSeconds | string | `"5"` | When the probe times out | -| api.replicas | string | `"1"` | Number of desired pods. This is a pointer to distinguish between explicit zero and not specified. Defaults to 1. | -| api.resources | object | `{}` | Compute Resources required by this container. 
More info: [manage-resources-containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) |
-| api.service.annotations | object | `{}` | annotations may need to be set when service.type is LoadBalancer, e.g. service.beta.kubernetes.io/aws-load-balancer-ssl-cert: arn:aws:acm:us-east-1:EXAMPLE_CERT |
-| api.service.clusterIP | string | `""` | clusterIP is the IP address of the service and is usually assigned randomly by the master |
-| api.service.externalIPs | list | `[]` | externalIPs is a list of IP addresses for which nodes in the cluster will also accept traffic for this service |
-| api.service.externalName | string | `""` | externalName is the external reference that kubedns or equivalent will return as a CNAME record for this service, requires Type to be ExternalName |
-| api.service.loadBalancerIP | string | `""` | loadBalancerIP when service.type is LoadBalancer. LoadBalancer will get created with the IP specified in this field |
-| api.service.nodePort | string | `""` | nodePort is the port on each node on which this api service is exposed when type=NodePort |
-| api.service.pythonNodePort | string | `""` | pythonNodePort is the port on each node on which this python api service is exposed when type=NodePort |
-| api.service.serviceMonitor | object | `{"annotations":{},"enabled":false,"interval":"15s","labels":{},"path":"/dolphinscheduler/actuator/prometheus"}` | serviceMonitor for prometheus operator |
-| api.service.serviceMonitor.annotations | object | `{}` | serviceMonitor.annotations ServiceMonitor annotations |
-| api.service.serviceMonitor.enabled | bool | `false` | Enable or disable api-server serviceMonitor |
-| api.service.serviceMonitor.interval | string | `"15s"` | serviceMonitor.interval interval at which metrics should be scraped |
-| api.service.serviceMonitor.labels | object | `{}` | serviceMonitor.labels ServiceMonitor extra labels |
-| api.service.serviceMonitor.path | string | `"/dolphinscheduler/actuator/prometheus"` | serviceMonitor.path path of the metrics endpoint |
-| api.service.type | string | `"ClusterIP"` | type determines how the Service is exposed. Defaults to ClusterIP. Valid options are ExternalName, ClusterIP, NodePort, and LoadBalancer |
-| api.strategy | object | `{"rollingUpdate":{"maxSurge":"25%","maxUnavailable":"25%"},"type":"RollingUpdate"}` | The deployment strategy to use to replace existing pods with new ones. |
-| api.strategy.rollingUpdate.maxSurge | string | `"25%"` | The maximum number of pods that can be scheduled above the desired number of pods |
-| api.strategy.rollingUpdate.maxUnavailable | string | `"25%"` | The maximum number of pods that can be unavailable during the update |
-| api.strategy.type | string | `"RollingUpdate"` | Type of deployment. Can be "Recreate" or "RollingUpdate" |
-| api.taskTypeFilter.enabled | bool | `false` | Enable or disable the task type filter. If set to true, the API-Server will only return the task types set in api.taskTypeFilter.task. Note: This feature only filters the task types shown on the WebUI; you can still create any task that DolphinScheduler supports via the API.
| -| api.taskTypeFilter.task | object | `{}` | ref: [task-type-config.yaml](https://github.com/apache/dolphinscheduler/blob/dev/dolphinscheduler-api/src/main/resources/task-type-config.yaml) | -| api.tolerations | list | `[]` | Tolerations are appended (excluding duplicates) to pods running with this RuntimeClass during admission, effectively unioning the set of nodes tolerated by the pod and the RuntimeClass. | -| common.configmap.DATAX_LAUNCHER | string | `"/opt/soft/datax/bin/datax.py"` | Set `DATAX_LAUNCHER` for DolphinScheduler's task environment | -| common.configmap.DATA_BASEDIR_PATH | string | `"/tmp/dolphinscheduler"` | User data directory path, self configuration, please make sure the directory exists and have read write permissions | -| common.configmap.DOLPHINSCHEDULER_OPTS | string | `""` | The jvm options for dolphinscheduler, suitable for all servers | -| common.configmap.FLINK_HOME | string | `"/opt/soft/flink"` | Set `FLINK_HOME` for DolphinScheduler's task environment | -| common.configmap.HADOOP_CONF_DIR | string | `"/opt/soft/hadoop/etc/hadoop"` | Set `HADOOP_CONF_DIR` for DolphinScheduler's task environment | -| common.configmap.HADOOP_HOME | string | `"/opt/soft/hadoop"` | Set `HADOOP_HOME` for DolphinScheduler's task environment | -| common.configmap.HIVE_HOME | string | `"/opt/soft/hive"` | Set `HIVE_HOME` for DolphinScheduler's task environment | -| common.configmap.JAVA_HOME | string | `"/opt/java/openjdk"` | Set `JAVA_HOME` for DolphinScheduler's task environment | -| common.configmap.PYTHON_LAUNCHER | string | `"/usr/bin/python/bin/python3"` | Set `PYTHON_LAUNCHER` for DolphinScheduler's task environment | -| common.configmap.RESOURCE_UPLOAD_PATH | string | `"/dolphinscheduler"` | Resource store on HDFS/S3 path, please make sure the directory exists on hdfs and have read write permissions | -| common.configmap.SPARK_HOME | string | `"/opt/soft/spark"` | Set `SPARK_HOME` for DolphinScheduler's task environment | -| common.fsFileResourcePersistence.accessModes | list | `["ReadWriteMany"]` | `PersistentVolumeClaim` access modes, must be `ReadWriteMany` | -| common.fsFileResourcePersistence.enabled | bool | `false` | Set `common.fsFileResourcePersistence.enabled` to `true` to mount a new file resource volume for `api` and `worker` | -| common.fsFileResourcePersistence.storage | string | `"20Gi"` | `PersistentVolumeClaim` size | -| common.fsFileResourcePersistence.storageClassName | string | `"-"` | Resource persistent volume storage class, must support the access mode: `ReadWriteMany` | -| common.sharedStoragePersistence.accessModes | list | `["ReadWriteMany"]` | `PersistentVolumeClaim` access modes, must be `ReadWriteMany` | -| common.sharedStoragePersistence.enabled | bool | `false` | Set `common.sharedStoragePersistence.enabled` to `true` to mount a shared storage volume for Hadoop, Spark binary and etc | -| common.sharedStoragePersistence.mountPath | string | `"/opt/soft"` | The mount path for the shared storage volume | -| common.sharedStoragePersistence.storage | string | `"20Gi"` | `PersistentVolumeClaim` size | -| common.sharedStoragePersistence.storageClassName | string | `"-"` | Shared Storage persistent volume storage class, must support the access mode: ReadWriteMany | -| conf.auto | bool | `false` | auto restart, if true, all components will be restarted automatically after the common configuration is updated. if false, you need to restart the components manually. 
default is false | -| conf.common."alert.rpc.port" | int | `50052` | rpc port | -| conf.common."appId.collect" | string | `"log"` | way to collect applicationId: log, aop | -| conf.common."conda.path" | string | `"/opt/anaconda3/etc/profile.d/conda.sh"` | set path of conda.sh | -| conf.common."data-quality.jar.name" | string | `"dolphinscheduler-data-quality-dev-SNAPSHOT.jar"` | data quality option | -| conf.common."data.basedir.path" | string | `"/tmp/dolphinscheduler"` | user data local directory path, please make sure the directory exists and have read write permissions | -| conf.common."datasource.encryption.enable" | bool | `false` | datasource encryption enable | -| conf.common."datasource.encryption.salt" | string | `"!@#$%^&*"` | datasource encryption salt | -| conf.common."development.state" | bool | `false` | development state | -| conf.common."hadoop.security.authentication.startup.state" | bool | `false` | whether to startup kerberos | -| conf.common."java.security.krb5.conf.path" | string | `"/opt/krb5.conf"` | java.security.krb5.conf path | -| conf.common."kerberos.expire.time" | int | `2` | kerberos expire time, the unit is hour | -| conf.common."login.user.keytab.path" | string | `"/opt/hdfs.headless.keytab"` | login user from keytab path | -| conf.common."login.user.keytab.username" | string | `"hdfs-mycluster@ESZ.COM"` | login user from keytab username | -| conf.common."ml.mlflow.preset_repository" | string | `"https://github.com/apache/dolphinscheduler-mlflow"` | mlflow task plugin preset repository | -| conf.common."ml.mlflow.preset_repository_version" | string | `"main"` | mlflow task plugin preset repository version | -| conf.common."resource.alibaba.cloud.access.key.id" | string | `""` | alibaba cloud access key id, required if you set resource.storage.type=OSS | -| conf.common."resource.alibaba.cloud.access.key.secret" | string | `""` | alibaba cloud access key secret, required if you set resource.storage.type=OSS | -| conf.common."resource.alibaba.cloud.oss.bucket.name" | string | `"dolphinscheduler"` | oss bucket name, required if you set resource.storage.type=OSS | -| conf.common."resource.alibaba.cloud.oss.endpoint" | string | `"https://oss-cn-hangzhou.aliyuncs.com"` | oss bucket endpoint, required if you set resource.storage.type=OSS | -| conf.common."resource.alibaba.cloud.region" | string | `"cn-hangzhou"` | alibaba cloud region, required if you set resource.storage.type=OSS | -| conf.common."resource.aws.access.key.id" | string | `"minioadmin"` | The AWS access key. if resource.storage.type=S3 or use EMR-Task, This configuration is required | -| conf.common."resource.aws.region" | string | `"ca-central-1"` | The AWS Region to use. if resource.storage.type=S3 or use EMR-Task, This configuration is required | -| conf.common."resource.aws.s3.bucket.name" | string | `"dolphinscheduler"` | The name of the bucket. You need to create them by yourself. Otherwise, the system cannot start. All buckets in Amazon S3 share a single namespace; ensure the bucket is given a unique name. | -| conf.common."resource.aws.s3.endpoint" | string | `"http://minio:9000"` | You need to set this parameter when private cloud s3. If S3 uses public cloud, you only need to set resource.aws.region or set to the endpoint of a public cloud such as S3.cn-north-1.amazonaws.com.cn | -| conf.common."resource.aws.secret.access.key" | string | `"minioadmin"` | The AWS secret access key. 
This configuration is required if resource.storage.type=S3 or when using EMR-Task |
-| conf.common."resource.azure.client.id" | string | `"minioadmin"` | azure storage account name, required if you set resource.storage.type=ABS |
-| conf.common."resource.azure.client.secret" | string | `"minioadmin"` | azure storage account key, required if you set resource.storage.type=ABS |
-| conf.common."resource.azure.subId" | string | `"minioadmin"` | azure storage subId, required if you set resource.storage.type=ABS |
-| conf.common."resource.azure.tenant.id" | string | `"minioadmin"` | azure storage tenantId, required if you set resource.storage.type=ABS |
-| conf.common."resource.hdfs.fs.defaultFS" | string | `"hdfs://mycluster:8020"` | if resource.storage.type=S3, the value should be like: s3a://dolphinscheduler; if resource.storage.type=HDFS and namenode HA is enabled, you need to copy core-site.xml and hdfs-site.xml to the conf dir |
-| conf.common."resource.hdfs.root.user" | string | `"hdfs"` | if resource.storage.type=HDFS, the user must have the permission to create directories under the HDFS root path |
-| conf.common."resource.manager.httpaddress.port" | int | `8088` | resourcemanager port, the default value is 8088 if not specified |
-| conf.common."resource.storage.type" | string | `"S3"` | resource storage type: HDFS, S3, OSS, GCS, ABS, NONE |
-| conf.common."resource.storage.upload.base.path" | string | `"/dolphinscheduler"` | resource store on HDFS/S3 path, resource files will be stored under this base path, self configuration, please make sure the directory exists on hdfs and has read/write permissions. "/dolphinscheduler" is recommended |
-| conf.common."sudo.enable" | bool | `true` | use sudo or not; if set true, the executing user is the tenant user and the deploy user needs sudo permissions; if set false, the executing user is the deploy user and doesn't need sudo permissions |
-| conf.common."support.hive.oneSession" | bool | `false` | Whether hive SQL is executed in the same session |
-| conf.common."task.resource.limit.state" | bool | `false` | Task resource limit state |
-| conf.common."yarn.application.status.address" | string | `"http://ds1:%s/ws/v1/cluster/apps/%s"` | if resourcemanager HA is enabled or resourcemanager is not used, please keep the default value; if resourcemanager is standalone, you only need to replace ds1 with the actual resourcemanager hostname |
-| conf.common."yarn.job.history.status.address" | string | `"http://ds1:19888/ws/v1/history/mapreduce/jobs/%s"` | job history status url when the application number threshold is reached (default 10000, maybe it was set to 1000) |
-| conf.common."yarn.resourcemanager.ha.rm.ids" | string | `"192.168.xx.xx,192.168.xx.xx"` | if resourcemanager HA is enabled, please set the HA IPs; if resourcemanager is standalone, keep this value empty |
-| externalDatabase.database | string | `"dolphinscheduler"` | The database of the external database |
-| externalDatabase.driverClassName | string | `"org.postgresql.Driver"` | The driverClassName of the external database |
-| externalDatabase.enabled | bool | `false` | If an external database exists and postgresql.enabled is set to false, the external database will be used; otherwise DolphinScheduler's internal database will be used. |
-| externalDatabase.host | string | `"localhost"` | The host of the external database |
-| externalDatabase.params | string | `"characterEncoding=utf8"` | The params of the external database |
-| externalDatabase.password | string | `"root"` | The password of the external database |
-| externalDatabase.port | string | `"5432"` | The port of the external database |
-| externalDatabase.type | string | `"postgresql"` | The type of the external database, supported types: postgresql, mysql |
-| externalDatabase.username | string | `"root"` | The username of the external database |
-| externalRegistry.registryPluginName | string | `"zookeeper"` | If an external registry exists and `zookeeper.enabled`, `registryEtcd.enabled` and `registryJdbc.enabled` are all set to false, specify the external registry plugin name |
-| externalRegistry.registryServers | string | `"127.0.0.1:2181"` | If an external registry exists and `zookeeper.enabled`, `registryEtcd.enabled` and `registryJdbc.enabled` are all set to false, specify the external registry servers |
-| image.alert | string | `"dolphinscheduler-alert-server"` | alert-server image |
-| image.api | string | `"dolphinscheduler-api"` | api-server image |
-| image.master | string | `"dolphinscheduler-master"` | master image |
-| image.pullPolicy | string | `"IfNotPresent"` | Image pull policy. Options: Always, Never, IfNotPresent |
-| image.pullSecret | string | `""` | Specify an imagePullSecret |
-| image.registry | string | `"apache/dolphinscheduler"` | Docker image repository for DolphinScheduler |
-| image.tag | string | `"latest"` | Docker image version for DolphinScheduler |
-| image.tools | string | `"dolphinscheduler-tools"` | tools image |
-| image.worker | string | `"dolphinscheduler-worker"` | worker image |
-| ingress.annotations | object | `{}` | Ingress annotations |
-| ingress.enabled | bool | `false` | Enable ingress |
-| ingress.host | string | `"dolphinscheduler.org"` | Ingress host |
-| ingress.path | string | `"/dolphinscheduler"` | Ingress path |
-| ingress.tls.enabled | bool | `false` | Enable ingress tls |
-| ingress.tls.secretName | string | `"dolphinscheduler-tls"` | Ingress tls secret name |
-| initImage | object | `{"busybox":"busybox:1.30.1","pullPolicy":"IfNotPresent"}` | Used to detect whether the services DolphinScheduler depends on, such as the database, are ready |
-| initImage.busybox | string | `"busybox:1.30.1"` | Specify initImage repository |
-| initImage.pullPolicy | string | `"IfNotPresent"` | Image pull policy. Options: Always, Never, IfNotPresent |
-| master.affinity | object | `{}` | Affinity is a group of affinity scheduling rules. If specified, the pod's scheduling constraints. More info: [node-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#node-affinity) |
-| master.annotations | object | `{}` | You can use annotations to attach arbitrary non-identifying metadata to objects. Clients such as tools and libraries can retrieve this metadata. |
-| master.enabled | bool | `true` | Enable or disable the Master component |
-| master.env.JAVA_OPTS | string | `"-Xms1g -Xmx1g -Xmn512m"` | The jvm options for master server |
-| master.env.MASTER_DISPATCH_TASK_NUM | string | `"3"` | Master dispatch task number per batch |
-| master.env.MASTER_EXEC_TASK_NUM | string | `"20"` | Master execute task number in parallel per process instance |
-| master.env.MASTER_EXEC_THREADS | string | `"100"` | Master execute thread number to limit process instances |
-| master.env.MASTER_FAILOVER_INTERVAL | string | `"10m"` | Master failover interval, the unit is minute |
-| master.env.MASTER_HEARTBEAT_ERROR_THRESHOLD | string | `"5"` | Master heartbeat error threshold |
-| master.env.MASTER_HEARTBEAT_INTERVAL | string | `"10s"` | Master heartbeat interval, the unit is second |
-| master.env.MASTER_HOST_SELECTOR | string | `"LowerWeight"` | Master host selector to select a suitable worker, optional values include Random, RoundRobin, LowerWeight |
-| master.env.MASTER_KILL_APPLICATION_WHEN_HANDLE_FAILOVER | string | `"true"` | Master kill application when handle failover |
-| master.env.MASTER_MAX_CPU_LOAD_AVG | string | `"1"` | Master max CPU load average; the master server can only schedule when the system CPU load average is lower than this value |
-| master.env.MASTER_RESERVED_MEMORY | string | `"0.3"` | Master reserved memory; the master server can only schedule when system available memory is higher than this value, the unit is G |
-| master.env.MASTER_STATE_WHEEL_INTERVAL | string | `"5s"` | Master state wheel interval, the unit is second |
-| master.env.MASTER_TASK_COMMIT_INTERVAL | string | `"1s"` | Master commit task interval, the unit is second |
-| master.env.MASTER_TASK_COMMIT_RETRYTIMES | string | `"5"` | Master commit task retry times |
-| master.livenessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container liveness. Container will be restarted if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) |
-| master.livenessProbe.enabled | bool | `true` | Turn on and off liveness probe |
-| master.livenessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe |
-| master.livenessProbe.initialDelaySeconds | string | `"30"` | Delay before liveness probe is initiated |
-| master.livenessProbe.periodSeconds | string | `"30"` | How often to perform the probe |
-| master.livenessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe |
-| master.livenessProbe.timeoutSeconds | string | `"5"` | When the probe times out |
-| master.nodeSelector | object | `{}` | NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: [assign-pod-node](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/) |
-| master.persistentVolumeClaim | object | `{"accessModes":["ReadWriteOnce"],"enabled":false,"storage":"20Gi","storageClassName":"-"}` | PersistentVolumeClaim represents a reference to a PersistentVolumeClaim in the same namespace. The StatefulSet controller is responsible for mapping network identities to claims in a way that maintains the identity of a pod. Every claim in this list must have at least one matching (by name) volumeMount in one container in the template. A claim in this list takes precedence over any volumes in the template, with the same name. |
-| master.persistentVolumeClaim.accessModes | list | `["ReadWriteOnce"]` | `PersistentVolumeClaim` access modes |
-| master.persistentVolumeClaim.enabled | bool | `false` | Set `master.persistentVolumeClaim.enabled` to `true` to mount a new volume for `master` |
-| master.persistentVolumeClaim.storage | string | `"20Gi"` | `PersistentVolumeClaim` size |
-| master.persistentVolumeClaim.storageClassName | string | `"-"` | `Master` logs data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning |
-| master.podManagementPolicy | string | `"Parallel"` | PodManagementPolicy controls how pods are created during initial scale up, when replacing pods on nodes, or when scaling down. |
-| master.readinessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) |
-| master.readinessProbe.enabled | bool | `true` | Turn on and off readiness probe |
-| master.readinessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe |
-| master.readinessProbe.initialDelaySeconds | string | `"30"` | Delay before readiness probe is initiated |
-| master.readinessProbe.periodSeconds | string | `"30"` | How often to perform the probe |
-| master.readinessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe |
-| master.readinessProbe.timeoutSeconds | string | `"5"` | When the probe times out |
-| master.replicas | string | `"3"` | Replicas is the desired number of replicas of the given Template. |
-| master.resources | object | `{}` | Compute Resources required by this container. More info: [manage-resources-containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) |
-| master.service.annotations | object | `{}` | annotations may need to be set when you want to scrape metrics with Prometheus but have not installed the Prometheus operator |
-| master.service.serviceMonitor | object | `{"annotations":{},"enabled":false,"interval":"15s","labels":{},"path":"/actuator/prometheus"}` | serviceMonitor for prometheus operator |
-| master.service.serviceMonitor.annotations | object | `{}` | serviceMonitor.annotations ServiceMonitor annotations |
-| master.service.serviceMonitor.enabled | bool | `false` | Enable or disable master serviceMonitor |
-| master.service.serviceMonitor.interval | string | `"15s"` | serviceMonitor.interval interval at which metrics should be scraped |
-| master.service.serviceMonitor.labels | object | `{}` | serviceMonitor.labels ServiceMonitor extra labels |
-| master.service.serviceMonitor.path | string | `"/actuator/prometheus"` | serviceMonitor.path path of the metrics endpoint |
-| master.tolerations | list | `[]` | Tolerations are appended (excluding duplicates) to pods running with this RuntimeClass during admission, effectively unioning the set of nodes tolerated by the pod and the RuntimeClass. |
-| minio.auth.rootPassword | string | `"minioadmin"` | minio password |
-| minio.auth.rootUser | string | `"minioadmin"` | minio username |
-| minio.defaultBuckets | string | `"dolphinscheduler"` | minio default buckets |
-| minio.enabled | bool | `true` | Deploy minio and configure it as the default storage for DolphinScheduler, note this is for demo only, not for production. |
-| minio.persistence.enabled | bool | `false` | Set minio.persistence.enabled to true to mount a new volume for internal minio |
-| mysql.auth.database | string | `"dolphinscheduler"` | mysql database |
-| mysql.auth.params | string | `"characterEncoding=utf8"` | mysql params |
-| mysql.auth.password | string | `"ds"` | mysql password |
-| mysql.auth.username | string | `"ds"` | mysql username |
-| mysql.driverClassName | string | `"com.mysql.cj.jdbc.Driver"` | mysql driverClassName |
-| mysql.enabled | bool | `false` | If no external MySQL exists, DolphinScheduler will use an internal MySQL by default |
-| mysql.primary.persistence.enabled | bool | `false` | Set mysql.primary.persistence.enabled to true to mount a new volume for internal MySQL |
-| mysql.primary.persistence.size | string | `"20Gi"` | `PersistentVolumeClaim` size |
-| mysql.primary.persistence.storageClass | string | `"-"` | MySQL data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning |
-| postgresql.driverClassName | string | `"org.postgresql.Driver"` | The driverClassName for internal PostgreSQL |
-| postgresql.enabled | bool | `true` | If no external PostgreSQL exists, DolphinScheduler will use an internal PostgreSQL by default |
-| postgresql.params | string | `"characterEncoding=utf8"` | The params for internal PostgreSQL |
-| postgresql.persistence.enabled | bool | `false` | Set postgresql.persistence.enabled to true to mount a new volume for internal PostgreSQL |
-| postgresql.persistence.size | string | `"20Gi"` | `PersistentVolumeClaim` size |
-| postgresql.persistence.storageClass | string | `"-"` | PostgreSQL data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning |
-| postgresql.postgresqlDatabase | string | `"dolphinscheduler"` | The database for internal PostgreSQL |
-| postgresql.postgresqlPassword | string | `"root"` | The password for internal PostgreSQL |
-| postgresql.postgresqlUsername | string | `"root"` | The username for internal PostgreSQL |
-| registryEtcd.authority | string | `""` | Etcd authority |
-| registryEtcd.enabled | bool | `false` | If you want to use Etcd for your registry center, change this value to true and set zookeeper.enabled to false |
-| registryEtcd.endpoints | string | `""` | Etcd endpoints |
-| registryEtcd.namespace | string | `"dolphinscheduler"` | Etcd namespace |
-| registryEtcd.passWord | string | `""` | Etcd password |
-| registryEtcd.ssl.certFile | string | `"etcd-certs/ca.crt"` | CertFile file path |
-| registryEtcd.ssl.enabled | bool | `false` | If your Etcd server is configured with SSL, change this value to true. See [here](https://github.com/etcd-io/jetcd/blob/main/docs/SslConfig.md) for how to convert the certificate files. |
-| registryEtcd.ssl.keyCertChainFile | string | `"etcd-certs/client.crt"` | keyCertChainFile file path |
-| registryEtcd.ssl.keyFile | string | `"etcd-certs/client.pem"` | keyFile file path |
-| registryEtcd.user | string | `""` | Etcd user |
-| registryJdbc.enabled | bool | `false` | If you want to use JDBC for your registry center, change this value to true and set zookeeper.enabled and registryEtcd.enabled to false |
-| registryJdbc.hikariConfig.driverClassName | string | `"com.mysql.cj.jdbc.Driver"` | Uses the same database as DolphinScheduler if you don't change this value. If you set this value, the JDBC registry will use it instead |
-| registryJdbc.hikariConfig.enabled | bool | `false` | Uses the same database as DolphinScheduler by default; if you want to use another database, change `enabled` to `true` and adjust the other configs |
-| registryJdbc.hikariConfig.jdbcurl | string | `"jdbc:mysql://"` | Uses the same database as DolphinScheduler if you don't change this value. If you set this value, the JDBC registry will use it instead |
-| registryJdbc.hikariConfig.password | string | `""` | Uses the same database as DolphinScheduler if you don't change this value. If you set this value, the JDBC registry will use it instead |
-| registryJdbc.hikariConfig.username | string | `""` | Uses the same database as DolphinScheduler if you don't change this value. If you set this value, the JDBC registry will use it instead |
-| registryJdbc.termExpireTimes | int | `3` | Used to calculate the expire time |
-| registryJdbc.termRefreshInterval | string | `"2s"` | Used to schedule refreshes of the ephemeral data/lock |
-| security.authentication.ldap.basedn | string | `"dc=example,dc=com"` | LDAP base dn |
-| security.authentication.ldap.password | string | `"password"` | LDAP password |
-| security.authentication.ldap.ssl.enable | bool | `false` | LDAP ssl switch |
-| security.authentication.ldap.ssl.jksbase64content | string | `""` | LDAP jks file base64 content. If you use macOS, please run `base64 -b 0 -i /path/to/your.jks`. If you use Linux, please run `base64 -w 0 /path/to/your.jks`. If you use Windows, please run `certutil -f -encode /path/to/your.jks`. Then copy the base64 content into this field in one line |
-| security.authentication.ldap.ssl.truststore | string | `"/opt/ldapkeystore.jks"` | LDAP jks file absolute path, do not change this value |
-| security.authentication.ldap.ssl.truststorepassword | string | `""` | LDAP jks password |
-| security.authentication.ldap.urls | string | `"ldap://ldap.forumsys.com:389/"` | LDAP urls |
-| security.authentication.ldap.user.admin | string | `"read-only-admin"` | Admin user account when you log in with LDAP |
-| security.authentication.ldap.user.emailattribute | string | `"mail"` | LDAP user email attribute |
-| security.authentication.ldap.user.identityattribute | string | `"uid"` | LDAP user identity attribute |
-| security.authentication.ldap.user.notexistaction | string | `"CREATE"` | Action when the LDAP user does not exist; default value: CREATE. Optional values: CREATE, DENY |
-| security.authentication.ldap.username | string | `"cn=read-only-admin,dc=example,dc=com"` | LDAP username |
-| security.authentication.type | string | `"PASSWORD"` | Authentication types (supported types: PASSWORD,LDAP,CASDOOR_SSO) |
-| timezone | string | `"Asia/Shanghai"` | World time and date for cities in all time zones |
-| worker.affinity | object | `{}` | Affinity is a group of affinity scheduling rules.
If specified, the pod's scheduling constraints. More info: [node-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#node-affinity) | -| worker.annotations | object | `{}` | You can use annotations to attach arbitrary non-identifying metadata to objects. Clients such as tools and libraries can retrieve this metadata. | -| worker.enabled | bool | `true` | Enable or disable the Worker component | -| worker.env.WORKER_EXEC_THREADS | string | `"100"` | Worker execute thread number to limit task instances | -| worker.env.WORKER_HEARTBEAT_INTERVAL | string | `"10s"` | Worker heartbeat interval, the unit is second | -| worker.env.WORKER_HEART_ERROR_THRESHOLD | string | `"5"` | Worker heartbeat error threshold | -| worker.env.WORKER_HOST_WEIGHT | string | `"100"` | Worker host weight to dispatch tasks | -| worker.env.WORKER_MAX_CPU_LOAD_AVG | string | `"1"` | Worker max cpu load avg, only higher than the system cpu load average, worker server can be dispatched tasks | -| worker.env.WORKER_RESERVED_MEMORY | string | `"0.3"` | Worker reserved memory, only lower than system available memory, worker server can be dispatched tasks, the unit is G | -| worker.keda.advanced | object | `{}` | Specify HPA related options | -| worker.keda.cooldownPeriod | int | `30` | How many seconds KEDA will wait before scaling to zero. Note that HPA has a separate cooldown period for scale-downs | -| worker.keda.enabled | bool | `false` | Enable or disable the Keda component | -| worker.keda.maxReplicaCount | int | `3` | Maximum number of workers created by keda | -| worker.keda.minReplicaCount | int | `0` | Minimum number of workers created by keda | -| worker.keda.namespaceLabels | object | `{}` | Keda namespace labels | -| worker.keda.pollingInterval | int | `5` | How often KEDA polls the DolphinScheduler DB to report new scale requests to the HPA | -| worker.livenessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container liveness. Container will be restarted if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) | -| worker.livenessProbe.enabled | bool | `true` | Turn on and off liveness probe | -| worker.livenessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe | -| worker.livenessProbe.initialDelaySeconds | string | `"30"` | Delay before liveness probe is initiated | -| worker.livenessProbe.periodSeconds | string | `"30"` | How often to perform the probe | -| worker.livenessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe | -| worker.livenessProbe.timeoutSeconds | string | `"5"` | When the probe times out | -| worker.nodeSelector | object | `{}` | NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: [assign-pod-node](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/) | -| worker.persistentVolumeClaim | object | `{"dataPersistentVolume":{"accessModes":["ReadWriteOnce"],"enabled":false,"storage":"20Gi","storageClassName":"-"},"enabled":false,"logsPersistentVolume":{"accessModes":["ReadWriteOnce"],"enabled":false,"storage":"20Gi","storageClassName":"-"}}` | PersistentVolumeClaim represents a reference to a PersistentVolumeClaim in the same namespace. 
The StatefulSet controller is responsible for mapping network identities to claims in a way that maintains the identity of a pod. Every claim in this list must have at least one matching (by name) volumeMount in one container in the template. A claim in this list takes precedence over any volumes in the template, with the same name. | -| worker.persistentVolumeClaim.dataPersistentVolume.accessModes | list | `["ReadWriteOnce"]` | `PersistentVolumeClaim` access modes | -| worker.persistentVolumeClaim.dataPersistentVolume.enabled | bool | `false` | Set `worker.persistentVolumeClaim.dataPersistentVolume.enabled` to `true` to mount a data volume for `worker` | -| worker.persistentVolumeClaim.dataPersistentVolume.storage | string | `"20Gi"` | `PersistentVolumeClaim` size | -| worker.persistentVolumeClaim.dataPersistentVolume.storageClassName | string | `"-"` | `Worker` data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning | -| worker.persistentVolumeClaim.enabled | bool | `false` | Set `worker.persistentVolumeClaim.enabled` to `true` to enable `persistentVolumeClaim` for `worker` | -| worker.persistentVolumeClaim.logsPersistentVolume.accessModes | list | `["ReadWriteOnce"]` | `PersistentVolumeClaim` access modes | -| worker.persistentVolumeClaim.logsPersistentVolume.enabled | bool | `false` | Set `worker.persistentVolumeClaim.logsPersistentVolume.enabled` to `true` to mount a logs volume for `worker` | -| worker.persistentVolumeClaim.logsPersistentVolume.storage | string | `"20Gi"` | `PersistentVolumeClaim` size | -| worker.persistentVolumeClaim.logsPersistentVolume.storageClassName | string | `"-"` | `Worker` logs data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning | -| worker.podManagementPolicy | string | `"Parallel"` | PodManagementPolicy controls how pods are created during initial scale up, when replacing pods on nodes, or when scaling down. | -| worker.readinessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) | -| worker.readinessProbe.enabled | bool | `true` | Turn on and off readiness probe | -| worker.readinessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe | -| worker.readinessProbe.initialDelaySeconds | string | `"30"` | Delay before readiness probe is initiated | -| worker.readinessProbe.periodSeconds | string | `"30"` | How often to perform the probe | -| worker.readinessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe | -| worker.readinessProbe.timeoutSeconds | string | `"5"` | When the probe times out | -| worker.replicas | string | `"3"` | Replicas is the desired number of replicas of the given Template. | -| worker.resources | object | `{}` | Compute Resources required by this container. 
More info: [manage-resources-containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) | -| worker.service.annotations | object | `{}` | annotations may need to be set when want to scrapy metrics by prometheus but not install prometheus operator | -| worker.service.serviceMonitor | object | `{"annotations":{},"enabled":false,"interval":"15s","labels":{},"path":"/actuator/prometheus"}` | serviceMonitor for prometheus operator | -| worker.service.serviceMonitor.annotations | object | `{}` | serviceMonitor.annotations ServiceMonitor annotations | -| worker.service.serviceMonitor.enabled | bool | `false` | Enable or disable worker serviceMonitor | -| worker.service.serviceMonitor.interval | string | `"15s"` | serviceMonitor.interval interval at which metrics should be scraped | -| worker.service.serviceMonitor.labels | object | `{}` | serviceMonitor.labels ServiceMonitor extra labels | -| worker.service.serviceMonitor.path | string | `"/actuator/prometheus"` | serviceMonitor.path path of the metrics endpoint | -| worker.tolerations | list | `[]` | Tolerations are appended (excluding duplicates) to pods running with this RuntimeClass during admission, effectively unioning the set of nodes tolerated by the pod and the RuntimeClass. | -| zookeeper.enabled | bool | `true` | If not exists external registry, the zookeeper registry will be used by default. | -| zookeeper.fourlwCommandsWhitelist | string | `"srvr,ruok,wchs,cons"` | A list of comma separated Four Letter Words commands to use | -| zookeeper.persistence.enabled | bool | `false` | Set `zookeeper.persistence.enabled` to true to mount a new volume for internal ZooKeeper | -| zookeeper.persistence.size | string | `"20Gi"` | PersistentVolumeClaim size | -| zookeeper.persistence.storageClass | string | `"-"` | ZooKeeper data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning | -| zookeeper.service.port | int | `2181` | The port of zookeeper | +| Key | Type | Default | Description | +|----------------------------------------------------------------------------------------|--------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| alert.affinity | object | `{}` | Affinity is a group of affinity scheduling rules. If specified, the pod's scheduling constraints. More info: [node-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#node-affinity) | +| alert.annotations | object | `{}` | You can use annotations to attach arbitrary non-identifying metadata to objects. Clients such as tools and libraries can retrieve this metadata. 
|
+| alert.enabled | bool | `true` | Enable or disable the Alert-Server component |
+| alert.env.JAVA_OPTS | string | `"-Xms512m -Xmx512m -Xmn256m"` | The JVM options for the alert server |
+| alert.livenessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container liveness. Container will be restarted if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) |
+| alert.livenessProbe.enabled | bool | `true` | Turn on and off liveness probe |
+| alert.livenessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe |
+| alert.livenessProbe.initialDelaySeconds | string | `"30"` | Delay before liveness probe is initiated |
+| alert.livenessProbe.periodSeconds | string | `"30"` | How often to perform the probe |
+| alert.livenessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe |
+| alert.livenessProbe.timeoutSeconds | string | `"5"` | When the probe times out |
+| alert.nodeSelector | object | `{}` | NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: [assign-pod-node](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/) |
+| alert.persistentVolumeClaim | object | `{"accessModes":["ReadWriteOnce"],"enabled":false,"storage":"20Gi","storageClassName":"-"}` | PersistentVolumeClaim represents a reference to a PersistentVolumeClaim in the same namespace. More info: [persistentvolumeclaims](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#persistentvolumeclaims) |
+| alert.persistentVolumeClaim.accessModes | list | `["ReadWriteOnce"]` | `PersistentVolumeClaim` access modes |
+| alert.persistentVolumeClaim.enabled | bool | `false` | Set `alert.persistentVolumeClaim.enabled` to `true` to mount a new volume for `alert` |
+| alert.persistentVolumeClaim.storage | string | `"20Gi"` | `PersistentVolumeClaim` size |
+| alert.persistentVolumeClaim.storageClassName | string | `"-"` | `Alert` logs data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning |
+| alert.readinessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) |
+| alert.readinessProbe.enabled | bool | `true` | Turn on and off readiness probe |
+| alert.readinessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe |
+| alert.readinessProbe.initialDelaySeconds | string | `"30"` | Delay before readiness probe is initiated |
+| alert.readinessProbe.periodSeconds | string | `"30"` | How often to perform the probe |
+| alert.readinessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe |
+| alert.readinessProbe.timeoutSeconds | string | `"5"` | When the probe times out |
+| alert.replicas | int | `1` | Number of desired pods. This is a pointer to distinguish between explicit zero and not specified. Defaults to 1. |
+| alert.resources | object | `{}` | Compute Resources required by this container. More info: [manage-resources-containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) |
+| alert.service.annotations | object | `{}` | annotations may need to be set when you want to scrape metrics with Prometheus but do not install the Prometheus Operator |
+| alert.service.serviceMonitor | object | `{"annotations":{},"enabled":false,"interval":"15s","labels":{},"path":"/actuator/prometheus"}` | serviceMonitor for prometheus operator |
+| alert.service.serviceMonitor.annotations | object | `{}` | serviceMonitor.annotations ServiceMonitor annotations |
+| alert.service.serviceMonitor.enabled | bool | `false` | Enable or disable alert-server serviceMonitor |
+| alert.service.serviceMonitor.interval | string | `"15s"` | serviceMonitor.interval interval at which metrics should be scraped |
+| alert.service.serviceMonitor.labels | object | `{}` | serviceMonitor.labels ServiceMonitor extra labels |
+| alert.service.serviceMonitor.path | string | `"/actuator/prometheus"` | serviceMonitor.path path of the metrics endpoint |
+| alert.strategy | object | `{"rollingUpdate":{"maxSurge":"25%","maxUnavailable":"25%"},"type":"RollingUpdate"}` | The deployment strategy to use to replace existing pods with new ones. |
+| alert.strategy.rollingUpdate.maxSurge | string | `"25%"` | The maximum number of pods that can be scheduled above the desired number of pods |
+| alert.strategy.rollingUpdate.maxUnavailable | string | `"25%"` | The maximum number of pods that can be unavailable during the update |
+| alert.strategy.type | string | `"RollingUpdate"` | Type of deployment. Can be "Recreate" or "RollingUpdate" |
+| alert.tolerations | list | `[]` | Tolerations are appended (excluding duplicates) to pods running with this RuntimeClass during admission, effectively unioning the set of nodes tolerated by the pod and the RuntimeClass. |
+| api.affinity | object | `{}` | Affinity is a group of affinity scheduling rules. If specified, the pod's scheduling constraints. More info: [node-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#node-affinity) |
+| api.annotations | object | `{}` | You can use annotations to attach arbitrary non-identifying metadata to objects. Clients such as tools and libraries can retrieve this metadata. |
+| api.enabled | bool | `true` | Enable or disable the API-Server component |
+| api.env.JAVA_OPTS | string | `"-Xms512m -Xmx512m -Xmn256m"` | The JVM options for the API server |
+| api.livenessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container liveness. Container will be restarted if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) |
+| api.livenessProbe.enabled | bool | `true` | Turn on and off liveness probe |
+| api.livenessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe |
+| api.livenessProbe.initialDelaySeconds | string | `"30"` | Delay before liveness probe is initiated |
+| api.livenessProbe.periodSeconds | string | `"30"` | How often to perform the probe |
+| api.livenessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe |
+| api.livenessProbe.timeoutSeconds | string | `"5"` | When the probe times out |
+| api.nodeSelector | object | `{}` | NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: [assign-pod-node](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/) |
+| api.persistentVolumeClaim | object | `{"accessModes":["ReadWriteOnce"],"enabled":false,"storage":"20Gi","storageClassName":"-"}` | PersistentVolumeClaim represents a reference to a PersistentVolumeClaim in the same namespace. More info: [persistentvolumeclaims](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#persistentvolumeclaims) |
+| api.persistentVolumeClaim.accessModes | list | `["ReadWriteOnce"]` | `PersistentVolumeClaim` access modes |
+| api.persistentVolumeClaim.enabled | bool | `false` | Set `api.persistentVolumeClaim.enabled` to `true` to mount a new volume for `api` |
+| api.persistentVolumeClaim.storage | string | `"20Gi"` | `PersistentVolumeClaim` size |
+| api.persistentVolumeClaim.storageClassName | string | `"-"` | `api` logs data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning |
+| api.readinessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) |
+| api.readinessProbe.enabled | bool | `true` | Turn on and off readiness probe |
+| api.readinessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe |
+| api.readinessProbe.initialDelaySeconds | string | `"30"` | Delay before readiness probe is initiated |
+| api.readinessProbe.periodSeconds | string | `"30"` | How often to perform the probe |
+| api.readinessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe |
+| api.readinessProbe.timeoutSeconds | string | `"5"` | When the probe times out |
+| api.replicas | string | `"1"` | Number of desired pods. This is a pointer to distinguish between explicit zero and not specified. Defaults to 1. |
+| api.resources | object | `{}` | Compute Resources required by this container. More info: [manage-resources-containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) |
+| api.service.annotations | object | `{}` | annotations may need to be set when service.type is LoadBalancer, e.g. service.beta.kubernetes.io/aws-load-balancer-ssl-cert: arn:aws:acm:us-east-1:EXAMPLE_CERT |
+| api.service.clusterIP | string | `""` | clusterIP is the IP address of the service and is usually assigned randomly by the master |
+| api.service.externalIPs | list | `[]` | externalIPs is a list of IP addresses for which nodes in the cluster will also accept traffic for this service |
+| api.service.externalName | string | `""` | externalName is the external reference that kubedns or equivalent will return as a CNAME record for this service, requires Type to be ExternalName |
+| api.service.loadBalancerIP | string | `""` | loadBalancerIP when service.type is LoadBalancer. LoadBalancer will get created with the IP specified in this field |
+| api.service.nodePort | string | `""` | nodePort is the port on each node on which this api service is exposed when type=NodePort |
+| api.service.pythonNodePort | string | `""` | pythonNodePort is the port on each node on which this python api service is exposed when type=NodePort |
+| api.service.serviceMonitor | object | `{"annotations":{},"enabled":false,"interval":"15s","labels":{},"path":"/dolphinscheduler/actuator/prometheus"}` | serviceMonitor for prometheus operator |
+| api.service.serviceMonitor.annotations | object | `{}` | serviceMonitor.annotations ServiceMonitor annotations |
+| api.service.serviceMonitor.enabled | bool | `false` | Enable or disable api-server serviceMonitor |
+| api.service.serviceMonitor.interval | string | `"15s"` | serviceMonitor.interval interval at which metrics should be scraped |
+| api.service.serviceMonitor.labels | object | `{}` | serviceMonitor.labels ServiceMonitor extra labels |
+| api.service.serviceMonitor.path | string | `"/dolphinscheduler/actuator/prometheus"` | serviceMonitor.path path of the metrics endpoint |
+| api.service.type | string | `"ClusterIP"` | type determines how the Service is exposed. Defaults to ClusterIP. Valid options are ExternalName, ClusterIP, NodePort, and LoadBalancer |
+| api.strategy | object | `{"rollingUpdate":{"maxSurge":"25%","maxUnavailable":"25%"},"type":"RollingUpdate"}` | The deployment strategy to use to replace existing pods with new ones. |
+| api.strategy.rollingUpdate.maxSurge | string | `"25%"` | The maximum number of pods that can be scheduled above the desired number of pods |
+| api.strategy.rollingUpdate.maxUnavailable | string | `"25%"` | The maximum number of pods that can be unavailable during the update |
+| api.strategy.type | string | `"RollingUpdate"` | Type of deployment. Can be "Recreate" or "RollingUpdate" |
+| api.taskTypeFilter.enabled | bool | `false` | Enable or disable the task type filter. If set to true, the API-Server will return tasks of a specific type set in `api.taskTypeFilter.task`. Note: this feature only filters tasks to return a specific type on the WebUI. However, you can still create any task that DolphinScheduler supports via the API. |
+| api.taskTypeFilter.task | object | `{}` | ref: [task-type-config.yaml](https://github.com/apache/dolphinscheduler/blob/dev/dolphinscheduler-api/src/main/resources/task-type-config.yaml) |
+| api.tolerations | list | `[]` | Tolerations are appended (excluding duplicates) to pods running with this RuntimeClass during admission, effectively unioning the set of nodes tolerated by the pod and the RuntimeClass. |
+| common.configmap.DATAX_LAUNCHER | string | `"/opt/soft/datax/bin/datax.py"` | Set `DATAX_LAUNCHER` for DolphinScheduler's task environment |
+| common.configmap.DATA_BASEDIR_PATH | string | `"/tmp/dolphinscheduler"` | User data directory path, self configuration, please make sure the directory exists and has read/write permissions |
+| common.configmap.DOLPHINSCHEDULER_OPTS | string | `""` | The JVM options for DolphinScheduler, suitable for all servers |
+| common.configmap.FLINK_HOME | string | `"/opt/soft/flink"` | Set `FLINK_HOME` for DolphinScheduler's task environment |
+| common.configmap.HADOOP_CONF_DIR | string | `"/opt/soft/hadoop/etc/hadoop"` | Set `HADOOP_CONF_DIR` for DolphinScheduler's task environment |
+| common.configmap.HADOOP_HOME | string | `"/opt/soft/hadoop"` | Set `HADOOP_HOME` for DolphinScheduler's task environment |
+| common.configmap.HIVE_HOME | string | `"/opt/soft/hive"` | Set `HIVE_HOME` for DolphinScheduler's task environment |
+| common.configmap.JAVA_HOME | string | `"/opt/java/openjdk"` | Set `JAVA_HOME` for DolphinScheduler's task environment |
+| common.configmap.PYTHON_LAUNCHER | string | `"/usr/bin/python/bin/python3"` | Set `PYTHON_LAUNCHER` for DolphinScheduler's task environment |
+| common.configmap.RESOURCE_UPLOAD_PATH | string | `"/dolphinscheduler"` | Resource store on HDFS/S3 path, please make sure the directory exists on HDFS and has read/write permissions |
+| common.configmap.SPARK_HOME | string | `"/opt/soft/spark"` | Set `SPARK_HOME` for DolphinScheduler's task environment |
+| common.fsFileResourcePersistence.accessModes | list | `["ReadWriteMany"]` | `PersistentVolumeClaim` access modes, must be `ReadWriteMany` |
+| common.fsFileResourcePersistence.enabled | bool | `false` | Set `common.fsFileResourcePersistence.enabled` to `true` to mount a new file resource volume for `api` and `worker` |
+| common.fsFileResourcePersistence.storage | string | `"20Gi"` | `PersistentVolumeClaim` size |
+| common.fsFileResourcePersistence.storageClassName | string | `"-"` | Resource persistent volume storage class, must support the access mode: `ReadWriteMany` |
+| common.sharedStoragePersistence.accessModes | list | `["ReadWriteMany"]` | `PersistentVolumeClaim` access modes, must be `ReadWriteMany` |
+| common.sharedStoragePersistence.enabled | bool | `false` | Set `common.sharedStoragePersistence.enabled` to `true` to mount a shared storage volume for Hadoop, Spark binaries, etc. |
+| common.sharedStoragePersistence.mountPath | string | `"/opt/soft"` | The mount path for the shared storage volume |
+| common.sharedStoragePersistence.storage | string | `"20Gi"` | `PersistentVolumeClaim` size |
+| common.sharedStoragePersistence.storageClassName | string | `"-"` | Shared Storage persistent volume storage class, must support the access mode: ReadWriteMany |
+| conf.auto | bool | `false` | Auto restart: if true, all components will be restarted automatically after the common configuration is updated; if false, you need to restart the components manually. Default is false |
+| conf.common."alert.rpc.port" | int | `50052` | rpc port |
+| conf.common."appId.collect" | string | `"log"` | way to collect applicationId: log, aop |
+| conf.common."conda.path" | string | `"/opt/anaconda3/etc/profile.d/conda.sh"` | set path of conda.sh |
+| conf.common."data-quality.jar.name" | string | `"dolphinscheduler-data-quality-dev-SNAPSHOT.jar"` | data quality option |
+| conf.common."data.basedir.path" | string | `"/tmp/dolphinscheduler"` | user data local directory path, please make sure the directory exists and has read/write permissions |
+| conf.common."datasource.encryption.enable" | bool | `false` | datasource encryption enable |
+| conf.common."datasource.encryption.salt" | string | `"!@#$%^&*"` | datasource encryption salt |
+| conf.common."development.state" | bool | `false` | development state |
+| conf.common."hadoop.security.authentication.startup.state" | bool | `false` | whether to startup kerberos |
+| conf.common."java.security.krb5.conf.path" | string | `"/opt/krb5.conf"` | java.security.krb5.conf path |
+| conf.common."kerberos.expire.time" | int | `2` | kerberos expire time, the unit is hour |
+| conf.common."login.user.keytab.path" | string | `"/opt/hdfs.headless.keytab"` | login user from keytab path |
+| conf.common."login.user.keytab.username" | string | `"hdfs-mycluster@ESZ.COM"` | login user from keytab username |
+| conf.common."ml.mlflow.preset_repository" | string | `"https://github.com/apache/dolphinscheduler-mlflow"` | mlflow task plugin preset repository |
+| conf.common."ml.mlflow.preset_repository_version" | string | `"main"` | mlflow task plugin preset repository version |
+| conf.common."resource.alibaba.cloud.access.key.id" | string | `""` | alibaba cloud access key id, required if you set resource.storage.type=OSS |
+| conf.common."resource.alibaba.cloud.access.key.secret" | string | `""` | alibaba cloud access key secret, required if you set resource.storage.type=OSS |
+| conf.common."resource.alibaba.cloud.oss.bucket.name" | string | `"dolphinscheduler"` | oss bucket name, required if you set resource.storage.type=OSS |
+| conf.common."resource.alibaba.cloud.oss.endpoint" | string | `"https://oss-cn-hangzhou.aliyuncs.com"` | oss bucket endpoint, required if you set resource.storage.type=OSS |
+| conf.common."resource.alibaba.cloud.region" | string | `"cn-hangzhou"` | alibaba cloud region, required if you set resource.storage.type=OSS |
+| conf.common."resource.aws.access.key.id" | string | `"minioadmin"` | The AWS access key. Required if resource.storage.type=S3 or if you use the EMR task |
+| conf.common."resource.aws.region" | string | `"ca-central-1"` | The AWS Region to use. Required if resource.storage.type=S3 or if you use the EMR task |
+| conf.common."resource.aws.s3.bucket.name" | string | `"dolphinscheduler"` | The name of the bucket. You need to create it yourself. Otherwise, the system cannot start. All buckets in Amazon S3 share a single namespace; ensure the bucket is given a unique name. |
+| conf.common."resource.aws.s3.endpoint" | string | `"http://minio:9000"` | You need to set this parameter when using a private cloud S3. If S3 is in a public cloud, you only need to set resource.aws.region, or set this to a public cloud endpoint such as s3.cn-north-1.amazonaws.com.cn |
+| conf.common."resource.aws.secret.access.key" | string | `"minioadmin"` | The AWS secret access key. Required if resource.storage.type=S3 or if you use the EMR task |
+| conf.common."resource.azure.client.id" | string | `"minioadmin"` | azure storage account name, required if you set resource.storage.type=ABS |
+| conf.common."resource.azure.client.secret" | string | `"minioadmin"` | azure storage account key, required if you set resource.storage.type=ABS |
+| conf.common."resource.azure.subId" | string | `"minioadmin"` | azure storage subId, required if you set resource.storage.type=ABS |
+| conf.common."resource.azure.tenant.id" | string | `"minioadmin"` | azure storage tenantId, required if you set resource.storage.type=ABS |
+| conf.common."resource.hdfs.fs.defaultFS" | string | `"hdfs://mycluster:8020"` | if resource.storage.type=S3, the value is like: s3a://dolphinscheduler; if resource.storage.type=HDFS and namenode HA is enabled, you need to copy core-site.xml and hdfs-site.xml to the conf dir |
+| conf.common."resource.hdfs.root.user" | string | `"hdfs"` | if resource.storage.type=HDFS, the user must have the permission to create directories under the HDFS root path |
+| conf.common."resource.manager.httpaddress.port" | int | `8088` | resourcemanager port, the default value is 8088 if not specified |
+| conf.common."resource.storage.type" | string | `"S3"` | resource storage type: HDFS, S3, OSS, GCS, ABS, NONE |
+| conf.common."resource.storage.upload.base.path" | string | `"/dolphinscheduler"` | resource store on HDFS/S3 path; resource files will be stored under this base path, please make sure the directory exists on HDFS and has read/write permissions. "/dolphinscheduler" is recommended |
+| conf.common."sudo.enable" | bool | `true` | use sudo or not; if set true, the executing user is the tenant user and the deploy user needs sudo permissions; if set false, the executing user is the deploy user and doesn't need sudo permissions |
+| conf.common."support.hive.oneSession" | bool | `false` | Whether hive SQL is executed in the same session |
+| conf.common."task.resource.limit.state" | bool | `false` | Task resource limit state |
+| conf.common."yarn.application.status.address" | string | `"http://ds1:%s/ws/v1/cluster/apps/%s"` | if resourcemanager HA is enabled or resourcemanager is not used, please keep the default value; if resourcemanager is single, you only need to replace ds1 with the actual resourcemanager hostname |
+| conf.common."yarn.job.history.status.address" | string | `"http://ds1:19888/ws/v1/history/mapreduce/jobs/%s"` | job history status url when the application number threshold is reached (default 10000, maybe it was set to 1000) |
+| conf.common."yarn.resourcemanager.ha.rm.ids" | string | `"192.168.xx.xx,192.168.xx.xx"` | if resourcemanager HA is enabled, please set the HA IPs; if resourcemanager is single, keep this value empty |
+| externalDatabase.database | string | `"dolphinscheduler"` | The database of external database |
+| externalDatabase.driverClassName | string | `"org.postgresql.Driver"` | The driverClassName of external database |
+| externalDatabase.enabled | bool | `false` | If an external database exists and postgresql.enabled is set to false, the external database will be used; otherwise DolphinScheduler's internal database will be used. |
+| externalDatabase.host | string | `"localhost"` | The host of external database |
+| externalDatabase.params | string | `"characterEncoding=utf8"` | The params of external database |
+| externalDatabase.password | string | `"root"` | The password of external database |
+| externalDatabase.port | string | `"5432"` | The port of external database |
+| externalDatabase.type | string | `"postgresql"` | The type of external database, supported types: postgresql, mysql |
+| externalDatabase.username | string | `"root"` | The username of external database |
+| externalRegistry.registryPluginName | string | `"zookeeper"` | If an external registry exists and `zookeeper.enabled` && `registryEtcd.enabled` && `registryJdbc.enabled` are set to false, specify the external registry plugin name |
+| externalRegistry.registryServers | string | `"127.0.0.1:2181"` | If an external registry exists and `zookeeper.enabled` && `registryEtcd.enabled` && `registryJdbc.enabled` are set to false, specify the external registry servers |
+| image.alert | string | `"dolphinscheduler-alert-server"` | alert-server image |
+| image.api | string | `"dolphinscheduler-api"` | api-server image |
+| image.master | string | `"dolphinscheduler-master"` | master image |
+| image.pullPolicy | string | `"IfNotPresent"` | Image pull policy. Options: Always, Never, IfNotPresent |
+| image.pullSecret | string | `""` | Specify an imagePullSecret |
+| image.registry | string | `"apache/dolphinscheduler"` | Docker image repository for the DolphinScheduler |
+| image.tag | string | `"latest"` | Docker image version for the DolphinScheduler |
+| image.tools | string | `"dolphinscheduler-tools"` | tools image |
+| image.worker | string | `"dolphinscheduler-worker"` | worker image |
+| ingress.annotations | object | `{}` | Ingress annotations |
+| ingress.enabled | bool | `false` | Enable ingress |
+| ingress.host | string | `"dolphinscheduler.org"` | Ingress host |
+| ingress.path | string | `"/dolphinscheduler"` | Ingress path |
+| ingress.tls.enabled | bool | `false` | Enable ingress tls |
+| ingress.tls.secretName | string | `"dolphinscheduler-tls"` | Ingress tls secret name |
+| initImage | object | `{"busybox":"busybox:1.30.1","pullPolicy":"IfNotPresent"}` | Used to detect whether DolphinScheduler's dependent services, such as the database, are ready |
+| initImage.busybox | string | `"busybox:1.30.1"` | Specify initImage repository |
+| initImage.pullPolicy | string | `"IfNotPresent"` | Image pull policy. Options: Always, Never, IfNotPresent |
+| master.affinity | object | `{}` | Affinity is a group of affinity scheduling rules. If specified, the pod's scheduling constraints. More info: [node-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#node-affinity) |
+| master.annotations | object | `{}` | You can use annotations to attach arbitrary non-identifying metadata to objects. Clients such as tools and libraries can retrieve this metadata. |
+| master.enabled | bool | `true` | Enable or disable the Master component |
+| master.env.JAVA_OPTS | string | `"-Xms1g -Xmx1g -Xmn512m"` | The JVM options for the master server |
+| master.env.MASTER_DISPATCH_TASK_NUM | string | `"3"` | Master dispatch task number per batch |
+| master.env.MASTER_EXEC_TASK_NUM | string | `"20"` | Master execute task number in parallel per process instance |
+| master.env.MASTER_EXEC_THREADS | string | `"100"` | Master execute thread number to limit process instances |
+| master.env.MASTER_FAILOVER_INTERVAL | string | `"10m"` | Master failover interval, the unit is minute |
+| master.env.MASTER_HEARTBEAT_ERROR_THRESHOLD | string | `"5"` | Master heartbeat error threshold |
+| master.env.MASTER_HEARTBEAT_INTERVAL | string | `"10s"` | Master heartbeat interval, the unit is second |
+| master.env.MASTER_HOST_SELECTOR | string | `"LowerWeight"` | Master host selector to select a suitable worker, optional values include Random, RoundRobin, LowerWeight |
+| master.env.MASTER_KILL_APPLICATION_WHEN_HANDLE_FAILOVER | string | `"true"` | Master kill application when handle failover |
+| master.env.MASTER_SERVER_LOAD_PROTECTION_ENABLED | bool | `true` | If set true, will open master overload protection |
+| master.env.MASTER_SERVER_LOAD_PROTECTION_MAX_CPU_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Master max cpu usage, when the master's cpu usage is smaller than this value, master server can execute workflow. |
+| master.env.MASTER_SERVER_LOAD_PROTECTION_MAX_JVM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Master max JVM memory usage, when the master's jvm memory usage is smaller than this value, master server can execute workflow. |
+| master.env.MASTER_SERVER_LOAD_PROTECTION_MAX_SYSTEM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Master max System memory usage, when the master's system memory usage is smaller than this value, master server can execute workflow. |
+| master.env.MASTER_SERVER_LOAD_PROTECTION_MAX_DISK_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Master max disk usage, when the master's disk usage is smaller than this value, master server can execute workflow. |
+| master.env.MASTER_STATE_WHEEL_INTERVAL | string | `"5s"` | master state wheel interval, the unit is second |
+| master.env.MASTER_TASK_COMMIT_INTERVAL | string | `"1s"` | master commit task interval, the unit is second |
+| master.env.MASTER_TASK_COMMIT_RETRYTIMES | string | `"5"` | Master commit task retry times |
+| master.livenessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container liveness. Container will be restarted if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) |
+| master.livenessProbe.enabled | bool | `true` | Turn on and off liveness probe |
+| master.livenessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe |
+| master.livenessProbe.initialDelaySeconds | string | `"30"` | Delay before liveness probe is initiated |
+| master.livenessProbe.periodSeconds | string | `"30"` | How often to perform the probe |
+| master.livenessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe |
+| master.livenessProbe.timeoutSeconds | string | `"5"` | When the probe times out |
+| master.nodeSelector | object | `{}` | NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: [assign-pod-node](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/) |
+| master.persistentVolumeClaim | object | `{"accessModes":["ReadWriteOnce"],"enabled":false,"storage":"20Gi","storageClassName":"-"}` | PersistentVolumeClaim represents a reference to a PersistentVolumeClaim in the same namespace. The StatefulSet controller is responsible for mapping network identities to claims in a way that maintains the identity of a pod. Every claim in this list must have at least one matching (by name) volumeMount in one container in the template. A claim in this list takes precedence over any volumes in the template, with the same name. |
+| master.persistentVolumeClaim.accessModes | list | `["ReadWriteOnce"]` | `PersistentVolumeClaim` access modes |
+| master.persistentVolumeClaim.enabled | bool | `false` | Set `master.persistentVolumeClaim.enabled` to `true` to mount a new volume for `master` |
+| master.persistentVolumeClaim.storage | string | `"20Gi"` | `PersistentVolumeClaim` size |
+| master.persistentVolumeClaim.storageClassName | string | `"-"` | `Master` logs data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning |
+| master.podManagementPolicy | string | `"Parallel"` | PodManagementPolicy controls how pods are created during initial scale up, when replacing pods on nodes, or when scaling down. |
+| master.readinessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) |
+| master.readinessProbe.enabled | bool | `true` | Turn on and off readiness probe |
+| master.readinessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe |
+| master.readinessProbe.initialDelaySeconds | string | `"30"` | Delay before readiness probe is initiated |
+| master.readinessProbe.periodSeconds | string | `"30"` | How often to perform the probe |
+| master.readinessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe |
+| master.readinessProbe.timeoutSeconds | string | `"5"` | When the probe times out |
+| master.replicas | string | `"3"` | Replicas is the desired number of replicas of the given Template. |
+| master.resources | object | `{}` | Compute Resources required by this container. More info: [manage-resources-containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) |
+| master.service.annotations | object | `{}` | annotations may need to be set when you want to scrape metrics with Prometheus but do not install the Prometheus Operator |
+| master.service.serviceMonitor | object | `{"annotations":{},"enabled":false,"interval":"15s","labels":{},"path":"/actuator/prometheus"}` | serviceMonitor for prometheus operator |
+| master.service.serviceMonitor.annotations | object | `{}` | serviceMonitor.annotations ServiceMonitor annotations |
+| master.service.serviceMonitor.enabled | bool | `false` | Enable or disable master serviceMonitor |
+| master.service.serviceMonitor.interval | string | `"15s"` | serviceMonitor.interval interval at which metrics should be scraped |
+| master.service.serviceMonitor.labels | object | `{}` | serviceMonitor.labels ServiceMonitor extra labels |
+| master.service.serviceMonitor.path | string | `"/actuator/prometheus"` | serviceMonitor.path path of the metrics endpoint |
+| master.tolerations | list | `[]` | Tolerations are appended (excluding duplicates) to pods running with this RuntimeClass during admission, effectively unioning the set of nodes tolerated by the pod and the RuntimeClass. |
+| minio.auth.rootPassword | string | `"minioadmin"` | minio password |
+| minio.auth.rootUser | string | `"minioadmin"` | minio username |
+| minio.defaultBuckets | string | `"dolphinscheduler"` | minio default buckets |
+| minio.enabled | bool | `true` | Deploy MinIO and configure it as the default storage for DolphinScheduler. Note: this is for demo only, not for production. |
+| minio.persistence.enabled | bool | `false` | Set minio.persistence.enabled to true to mount a new volume for internal minio |
+| mysql.auth.database | string | `"dolphinscheduler"` | mysql database |
+| mysql.auth.params | string | `"characterEncoding=utf8"` | mysql params |
+| mysql.auth.password | string | `"ds"` | mysql password |
+| mysql.auth.username | string | `"ds"` | mysql username |
+| mysql.driverClassName | string | `"com.mysql.cj.jdbc.Driver"` | mysql driverClassName |
+| mysql.enabled | bool | `false` | If no external MySQL exists, DolphinScheduler will use an internal MySQL by default |
+| mysql.primary.persistence.enabled | bool | `false` | Set mysql.primary.persistence.enabled to true to mount a new volume for internal MySQL |
+| mysql.primary.persistence.size | string | `"20Gi"` | `PersistentVolumeClaim` size |
+| mysql.primary.persistence.storageClass | string | `"-"` | MySQL data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning |
+| postgresql.driverClassName | string | `"org.postgresql.Driver"` | The driverClassName for internal PostgreSQL |
+| postgresql.enabled | bool | `true` | If no external PostgreSQL exists, DolphinScheduler will use an internal PostgreSQL by default |
+| postgresql.params | string | `"characterEncoding=utf8"` | The params for internal PostgreSQL |
+| postgresql.persistence.enabled | bool | `false` | Set postgresql.persistence.enabled to true to mount a new volume for internal PostgreSQL |
+| postgresql.persistence.size | string | `"20Gi"` | `PersistentVolumeClaim` size |
+| postgresql.persistence.storageClass | string | `"-"` | PostgreSQL data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning |
+| postgresql.postgresqlDatabase | string | `"dolphinscheduler"` | The database for internal PostgreSQL |
+| postgresql.postgresqlPassword | string | `"root"` | The password for internal PostgreSQL |
+| postgresql.postgresqlUsername | string | `"root"` | The username for internal PostgreSQL |
+| registryEtcd.authority | string | `""` | Etcd authority |
+| registryEtcd.enabled | bool | `false` | If you want to use Etcd for your registry center, change this value to true, and set zookeeper.enabled to false |
+| registryEtcd.endpoints | string | `""` | Etcd endpoints |
+| registryEtcd.namespace | string | `"dolphinscheduler"` | Etcd namespace |
+| registryEtcd.passWord | string | `""` | Etcd password |
+| registryEtcd.ssl.certFile | string | `"etcd-certs/ca.crt"` | CertFile file path |
+| registryEtcd.ssl.enabled | bool | `false` | If your Etcd server is configured with SSL, change this value to true. See [here](https://github.com/etcd-io/jetcd/blob/main/docs/SslConfig.md) for how to convert the certification files. |
+| registryEtcd.ssl.keyCertChainFile | string | `"etcd-certs/client.crt"` | keyCertChainFile file path |
+| registryEtcd.ssl.keyFile | string | `"etcd-certs/client.pem"` | keyFile file path |
+| registryEtcd.user | string | `""` | Etcd user |
+| registryJdbc.enabled | bool | `false` | If you want to use JDBC for your registry center, change this value to true, and set zookeeper.enabled and registryEtcd.enabled to false |
+| registryJdbc.hikariConfig.driverClassName | string | `"com.mysql.cj.jdbc.Driver"` | Defaults to DolphinScheduler's database if you don't change this value; if set, the JDBC registry will use it |
+| registryJdbc.hikariConfig.enabled | bool | `false` | Defaults to DolphinScheduler's database; if you want to use another database, change `enabled` to `true` and update the other configs |
+| registryJdbc.hikariConfig.jdbcurl | string | `"jdbc:mysql://"` | Defaults to DolphinScheduler's database if you don't change this value; if set, the JDBC registry will use it |
+| registryJdbc.hikariConfig.password | string | `""` | Defaults to DolphinScheduler's database if you don't change this value; if set, the JDBC registry will use it |
+| registryJdbc.hikariConfig.username | string | `""` | Defaults to DolphinScheduler's database if you don't change this value; if set, the JDBC registry will use it |
+| registryJdbc.termExpireTimes | int | `3` | Used to calculate the expire time |
+| registryJdbc.termRefreshInterval | string | `"2s"` | Used to schedule the refresh of ephemeral data and locks |
+| security.authentication.ldap.basedn | string | `"dc=example,dc=com"` | LDAP base dn |
+| security.authentication.ldap.password | string | `"password"` | LDAP password |
+| security.authentication.ldap.ssl.enable | bool | `false` | LDAP SSL switch |
+| security.authentication.ldap.ssl.jksbase64content | string | `""` | LDAP jks file base64 content. If you use macOS, please run `base64 -b 0 -i /path/to/your.jks`. If you use Linux, please run `base64 -w 0 /path/to/your.jks`. If you use Windows, please run `certutil -f -encode /path/to/your.jks`. Then copy the base64 content to the field below in one line |
+| security.authentication.ldap.ssl.truststore | string | `"/opt/ldapkeystore.jks"` | LDAP jks file absolute path, do not change this value |
+| security.authentication.ldap.ssl.truststorepassword | string | `""` | LDAP jks password |
+| security.authentication.ldap.urls | string | `"ldap://ldap.forumsys.com:389/"` | LDAP urls |
+| security.authentication.ldap.user.admin | string | `"read-only-admin"` | Admin user account when you log in with LDAP |
+| security.authentication.ldap.user.emailattribute | string | `"mail"` | LDAP user email attribute |
+| security.authentication.ldap.user.identityattribute | string | `"uid"` | LDAP user identity attribute |
+| security.authentication.ldap.user.notexistaction | string | `"CREATE"` | action when the LDAP user does not exist, default value: CREATE. Optional values include CREATE, DENY |
+| security.authentication.ldap.username | string | `"cn=read-only-admin,dc=example,dc=com"` | LDAP username |
+| security.authentication.type | string | `"PASSWORD"` | Authentication types (supported types: PASSWORD, LDAP, CASDOOR_SSO) |
+| timezone | string | `"Asia/Shanghai"` | World time and date for cities in all time zones |
+| worker.affinity | object | `{}` | Affinity is a group of affinity scheduling rules. If specified, the pod's scheduling constraints. More info: [node-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#node-affinity) |
+| worker.annotations | object | `{}` | You can use annotations to attach arbitrary non-identifying metadata to objects. Clients such as tools and libraries can retrieve this metadata. |
+| worker.enabled | bool | `true` | Enable or disable the Worker component |
+| worker.env.WORKER_EXEC_THREADS | string | `"100"` | Worker execute thread number to limit task instances |
+| worker.env.WORKER_HEARTBEAT_INTERVAL | string | `"10s"` | Worker heartbeat interval, the unit is second |
+| worker.env.WORKER_HEART_ERROR_THRESHOLD | string | `"5"` | Worker heartbeat error threshold |
+| worker.env.WORKER_HOST_WEIGHT | string | `"100"` | Worker host weight to dispatch tasks |
+| worker.env.WORKER_SERVER_LOAD_PROTECTION_ENABLED | bool | `true` | If set true, will open worker overload protection |
+| worker.env.WORKER_SERVER_LOAD_PROTECTION_MAX_CPU_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Worker max cpu usage, when the worker's cpu usage is smaller than this value, worker server can be dispatched tasks. |
+| worker.env.WORKER_SERVER_LOAD_PROTECTION_MAX_JVM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Worker max jvm memory usage, when the worker's jvm memory usage is smaller than this value, worker server can be dispatched tasks. |
+| worker.env.WORKER_SERVER_LOAD_PROTECTION_MAX_SYSTEM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Worker max memory usage, when the worker's memory usage is smaller than this value, worker server can be dispatched tasks. |
+| worker.env.WORKER_SERVER_LOAD_PROTECTION_MAX_DISK_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Worker max disk usage, when the worker's disk usage is smaller than this value, worker server can be dispatched tasks. |
+| worker.keda.advanced | object | `{}` | Specify HPA related options |
+| worker.keda.cooldownPeriod | int | `30` | How many seconds KEDA will wait before scaling to zero. Note that HPA has a separate cooldown period for scale-downs |
+| worker.keda.enabled | bool | `false` | Enable or disable the Keda component |
+| worker.keda.maxReplicaCount | int | `3` | Maximum number of workers created by keda |
+| worker.keda.minReplicaCount | int | `0` | Minimum number of workers created by keda |
+| worker.keda.namespaceLabels | object | `{}` | Keda namespace labels |
+| worker.keda.pollingInterval | int | `5` | How often KEDA polls the DolphinScheduler DB to report new scale requests to the HPA |
+| worker.livenessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container liveness. Container will be restarted if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) |
+| worker.livenessProbe.enabled | bool | `true` | Turn on and off liveness probe |
+| worker.livenessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe |
+| worker.livenessProbe.initialDelaySeconds | string | `"30"` | Delay before liveness probe is initiated |
+| worker.livenessProbe.periodSeconds | string | `"30"` | How often to perform the probe |
+| worker.livenessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe |
+| worker.livenessProbe.timeoutSeconds | string | `"5"` | When the probe times out |
+| worker.nodeSelector | object | `{}` | NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: [assign-pod-node](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/) |
+| worker.persistentVolumeClaim | object | `{"dataPersistentVolume":{"accessModes":["ReadWriteOnce"],"enabled":false,"storage":"20Gi","storageClassName":"-"},"enabled":false,"logsPersistentVolume":{"accessModes":["ReadWriteOnce"],"enabled":false,"storage":"20Gi","storageClassName":"-"}}` | PersistentVolumeClaim represents a reference to a PersistentVolumeClaim in the same namespace. The StatefulSet controller is responsible for mapping network identities to claims in a way that maintains the identity of a pod. Every claim in this list must have at least one matching (by name) volumeMount in one container in the template. A claim in this list takes precedence over any volumes in the template, with the same name. |
+| worker.persistentVolumeClaim.dataPersistentVolume.accessModes | list | `["ReadWriteOnce"]` | `PersistentVolumeClaim` access modes |
+| worker.persistentVolumeClaim.dataPersistentVolume.enabled | bool | `false` | Set `worker.persistentVolumeClaim.dataPersistentVolume.enabled` to `true` to mount a data volume for `worker` |
+| worker.persistentVolumeClaim.dataPersistentVolume.storage | string | `"20Gi"` | `PersistentVolumeClaim` size |
+| worker.persistentVolumeClaim.dataPersistentVolume.storageClassName | string | `"-"` | `Worker` data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning |
+| worker.persistentVolumeClaim.enabled | bool | `false` | Set `worker.persistentVolumeClaim.enabled` to `true` to enable `persistentVolumeClaim` for `worker` |
+| worker.persistentVolumeClaim.logsPersistentVolume.accessModes | list | `["ReadWriteOnce"]` | `PersistentVolumeClaim` access modes |
+| worker.persistentVolumeClaim.logsPersistentVolume.enabled | bool | `false` | Set `worker.persistentVolumeClaim.logsPersistentVolume.enabled` to `true` to mount a logs volume for `worker` |
+| worker.persistentVolumeClaim.logsPersistentVolume.storage | string | `"20Gi"` | `PersistentVolumeClaim` size |
+| worker.persistentVolumeClaim.logsPersistentVolume.storageClassName | string | `"-"` | `Worker` logs data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning |
+| worker.podManagementPolicy | string | `"Parallel"` | PodManagementPolicy controls how pods are created during initial scale up, when replacing pods on nodes, or when scaling down. |
+| worker.readinessProbe | object | `{"enabled":true,"failureThreshold":"3","initialDelaySeconds":"30","periodSeconds":"30","successThreshold":"1","timeoutSeconds":"5"}` | Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. More info: [container-probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes) |
+| worker.readinessProbe.enabled | bool | `true` | Turn on and off readiness probe |
+| worker.readinessProbe.failureThreshold | string | `"3"` | Minimum consecutive failures for the probe |
+| worker.readinessProbe.initialDelaySeconds | string | `"30"` | Delay before readiness probe is initiated |
+| worker.readinessProbe.periodSeconds | string | `"30"` | How often to perform the probe |
+| worker.readinessProbe.successThreshold | string | `"1"` | Minimum consecutive successes for the probe |
+| worker.readinessProbe.timeoutSeconds | string | `"5"` | When the probe times out |
+| worker.replicas | string | `"3"` | Replicas is the desired number of replicas of the given Template. |
+| worker.resources | object | `{}` | Compute Resources required by this container. More info: [manage-resources-containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) |
+| worker.service.annotations | object | `{}` | annotations may need to be set when you want to scrape metrics with Prometheus but do not install the Prometheus Operator |
+| worker.service.serviceMonitor | object | `{"annotations":{},"enabled":false,"interval":"15s","labels":{},"path":"/actuator/prometheus"}` | serviceMonitor for prometheus operator |
+| worker.service.serviceMonitor.annotations | object | `{}` | serviceMonitor.annotations ServiceMonitor annotations |
+| worker.service.serviceMonitor.enabled | bool | `false` | Enable or disable worker serviceMonitor |
+| worker.service.serviceMonitor.interval | string | `"15s"` | serviceMonitor.interval interval at which metrics should be scraped |
+| worker.service.serviceMonitor.labels | object | `{}` | serviceMonitor.labels ServiceMonitor extra labels |
+| worker.service.serviceMonitor.path | string | `"/actuator/prometheus"` | serviceMonitor.path path of the metrics endpoint |
+| worker.tolerations | list | `[]` | Tolerations are appended (excluding duplicates) to pods running with this RuntimeClass during admission, effectively unioning the set of nodes tolerated by the pod and the RuntimeClass. |
+| zookeeper.enabled | bool | `true` | If no external registry exists, the ZooKeeper registry will be used by default. |
+| zookeeper.fourlwCommandsWhitelist | string | `"srvr,ruok,wchs,cons"` | A list of comma separated Four Letter Words commands to use |
+| zookeeper.persistence.enabled | bool | `false` | Set `zookeeper.persistence.enabled` to true to mount a new volume for internal ZooKeeper |
+| zookeeper.persistence.size | string | `"20Gi"` | PersistentVolumeClaim size |
+| zookeeper.persistence.storageClass | string | `"-"` | ZooKeeper data persistent volume storage class. If set to "-", storageClassName: "", which disables dynamic provisioning |
+| zookeeper.service.port | int | `2181` | The port of zookeeper |
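For example, the new overload-protection values documented above can be set from a custom values file. A minimal sketch, assuming a release installed from this chart; the file name `values-override.yaml` and the threshold numbers are illustrative, not defaults:

```yaml
# values-override.yaml (hypothetical): enable overload protection on master and worker.
master:
  env:
    MASTER_SERVER_LOAD_PROTECTION_ENABLED: true
    # Stop scheduling new workflows once master CPU usage exceeds 80%.
    MASTER_SERVER_LOAD_PROTECTION_MAX_CPU_USAGE_PERCENTAGE_THRESHOLDS: 0.8
worker:
  env:
    WORKER_SERVER_LOAD_PROTECTION_ENABLED: true
    # Stop accepting new tasks once worker disk usage exceeds 90%.
    WORKER_SERVER_LOAD_PROTECTION_MAX_DISK_USAGE_PERCENTAGE_THRESHOLDS: 0.9
```

This would typically be applied with something like `helm upgrade --install dolphinscheduler . -f values-override.yaml` from the chart directory.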
diff --git a/deploy/kubernetes/dolphinscheduler/values.yaml b/deploy/kubernetes/dolphinscheduler/values.yaml
index cea0e616db0c9..844db3cab762a 100644
--- a/deploy/kubernetes/dolphinscheduler/values.yaml
+++ b/deploy/kubernetes/dolphinscheduler/values.yaml
@@ -506,10 +506,15 @@ master:
     MASTER_TASK_COMMIT_INTERVAL: "1s"
     # -- master state wheel interval, the unit is second
     MASTER_STATE_WHEEL_INTERVAL: "5s"
-    # -- Master max cpuload avg, only higher than the system cpu load average, master server can schedule
-    MASTER_MAX_CPU_LOAD_AVG: "1"
-    # -- Master reserved memory, only lower than system available memory, master server can schedule, the unit is G
-    MASTER_RESERVED_MEMORY: "0.3"
+    MASTER_SERVER_LOAD_PROTECTION_ENABLED: true
+    # -- Master max cpu usage, when the master's cpu usage is smaller than this value, master server can execute workflow.
+    MASTER_SERVER_LOAD_PROTECTION_MAX_CPU_USAGE_PERCENTAGE_THRESHOLDS: 0.7
+    # -- Master max JVM memory usage, when the master's jvm memory usage is smaller than this value, master server can execute workflow.
+    MASTER_SERVER_LOAD_PROTECTION_MAX_JVM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS: 0.7
+    # -- Master max System memory usage, when the master's system memory usage is smaller than this value, master server can execute workflow.
+    MASTER_SERVER_LOAD_PROTECTION_MAX_SYSTEM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS: 0.7
+    # -- Master max disk usage, when the master's disk usage is smaller than this value, master server can execute workflow.
+    MASTER_SERVER_LOAD_PROTECTION_MAX_DISK_USAGE_PERCENTAGE_THRESHOLDS: 0.7
     # -- Master failover interval, the unit is minute
     MASTER_FAILOVER_INTERVAL: "10m"
     # -- Master kill application when handle failover
@@ -621,11 +626,15 @@ worker:
       # -- `PersistentVolumeClaim` size
       storage: "20Gi"
   env:
-    # -- Worker max cpu load avg, only higher than the system cpu load average, worker server can be dispatched tasks
-    WORKER_MAX_CPU_LOAD_AVG: "1"
-    # -- Worker reserved memory, only lower than system available memory, worker server can be dispatched tasks, the unit is G
-    WORKER_RESERVED_MEMORY: "0.3"
-    # -- Worker execute thread number to limit task instances
+    WORKER_SERVER_LOAD_PROTECTION_ENABLED: true
+    # -- Worker max cpu usage, when the worker's cpu usage is smaller than this value, worker server can be dispatched tasks.
+    WORKER_SERVER_LOAD_PROTECTION_MAX_CPU_USAGE_PERCENTAGE_THRESHOLDS: 0.7
+    # -- Worker max jvm memory usage, when the worker's jvm memory usage is smaller than this value, worker server can be dispatched tasks.
+    WORKER_SERVER_LOAD_PROTECTION_MAX_JVM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS: 0.7
+    # -- Worker max memory usage, when the worker's memory usage is smaller than this value, worker server can be dispatched tasks.
+    WORKER_SERVER_LOAD_PROTECTION_MAX_SYSTEM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS: 0.7
+    # -- Worker max disk usage, when the worker's disk usage is smaller than this value, worker server can be dispatched tasks.
+    WORKER_SERVER_LOAD_PROTECTION_MAX_DISK_USAGE_PERCENTAGE_THRESHOLDS: 0.7
     WORKER_EXEC_THREADS: "100"
     # -- Worker heartbeat interval, the unit is second
     WORKER_HEARTBEAT_INTERVAL: "10s"
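These chart env vars appear to follow Spring Boot's relaxed-binding convention, so each should map onto the `server-load-protection` properties documented in `configuration.md` below (e.g. `MASTER_SERVER_LOAD_PROTECTION_ENABLED` → `master.server-load-protection.enabled`). A sketch of the equivalent `master-server/conf/application.yaml` fragment; the property names and defaults are taken from the configuration table, but treat the exact nesting as an assumption:

```yaml
# Hypothetical fragment of master-server/conf/application.yaml.
master:
  server-load-protection:
    enabled: true
    # The master stops executing new workflows once any of these fractions is exceeded.
    max-cpu-usage-percentage-thresholds: 0.7
    max-jvm-memory-usage-percentage-thresholds: 0.7
    max-system-memory-usage-percentage-thresholds: 0.7
    max-disk-usage-percentage-thresholds: 0.7
```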
diff --git a/docs/docs/en/architecture/configuration.md b/docs/docs/en/architecture/configuration.md
index 01f925055fdb3..903219a011684 100644
--- a/docs/docs/en/architecture/configuration.md
+++ b/docs/docs/en/architecture/configuration.md
@@ -279,46 +279,50 @@ Location: `api-server/conf/application.yaml`
 
 Location: `master-server/conf/application.yaml`
 
-| Parameters | Default value | Description |
-|------------------------------------------------------|---------------|------------------------------------------------------------------------------|
-| master.listen-port | 5678 | master listen port |
-| master.fetch-command-num | 10 | the number of commands fetched by master |
-| master.pre-exec-threads | 10 | master prepare execute thread number to limit handle commands in parallel |
-| master.exec-threads | 100 | master execute thread number to limit process instances in parallel |
-| master.dispatch-task-number | 3 | master dispatch task number per batch |
-| master.host-selector | lower_weight | master host selector to select a suitable worker, default value: LowerWeight. Optional values include random, round_robin, lower_weight |
-| master.heartbeat-interval | 10 | master heartbeat interval, the unit is second |
-| master.task-commit-retry-times | 5 | master commit task retry times |
-| master.task-commit-interval | 1000 | master commit task interval, the unit is millisecond |
-| master.state-wheel-interval | 5 | time to check status |
-| master.max-cpu-load-avg | 1 | master max cpuload avg percentage, only higher than the system cpu load average, master server can schedule. default value 1: will use 100% cpu |
-| master.reserved-memory | 0.3 | master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, only the available memory is higher than 30%, master server can schedule. |
-| master.failover-interval | 10 | failover interval, the unit is minute |
-| master.kill-application-when-task-failover | true | whether to kill yarn/k8s application when failover taskInstance |
-| master.registry-disconnect-strategy.strategy | stop | Used when the master disconnect from registry, default value: stop. Optional values include stop, waiting |
-| master.registry-disconnect-strategy.max-waiting-time | 100s | Used when the master disconnect from registry, and the disconnect strategy is waiting, this config means the master will waiting to reconnect to registry in given times, and after the waiting times, if the master still cannot connect to registry, will stop itself, if the value is 0s, the Master will wait infinitely |
-| master.worker-group-refresh-interval | 10s | The interval to refresh worker group from db to memory |
+| Parameters | Default value | Description |
+|------------------------------------------------------------------------------|---------------|------------------------------------------------------------------------------|
+| master.listen-port | 5678 | master listen port |
+| master.fetch-command-num | 10 | the number of commands fetched by master |
+| master.pre-exec-threads | 10 | master prepare execute thread number to limit handle commands in parallel |
+| master.exec-threads | 100 | master execute thread number to limit process instances in parallel |
+| master.dispatch-task-number | 3 | master dispatch task number per batch |
+| master.host-selector | lower_weight | master host selector to select a suitable worker, default value: LowerWeight. Optional values include random, round_robin, lower_weight |
+| master.heartbeat-interval | 10 | master heartbeat interval, the unit is second |
+| master.task-commit-retry-times | 5 | master commit task retry times |
+| master.task-commit-interval | 1000 | master commit task interval, the unit is millisecond |
+| master.state-wheel-interval | 5 | time to check status |
+| master.server-load-protection.enabled | true | If set true, will open master overload protection |
+| master.server-load-protection.max-cpu-usage-percentage-thresholds | 0.7 | Master max cpu usage, when the master's cpu usage is smaller than this value, master server can execute workflow. |
+| master.server-load-protection.max-jvm-memory-usage-percentage-thresholds | 0.7 | Master max JVM memory usage, when the master's jvm memory usage is smaller than this value, master server can execute workflow. |
+| master.server-load-protection.max-system-memory-usage-percentage-thresholds | 0.7 | Master max System memory usage, when the master's system memory usage is smaller than this value, master server can execute workflow. |
+| master.server-load-protection.max-disk-usage-percentage-thresholds | 0.7 | Master max disk usage, when the master's disk usage is smaller than this value, master server can execute workflow. |
+| master.failover-interval | 10 | failover interval, the unit is minute |
+| master.kill-application-when-task-failover | true | whether to kill yarn/k8s application when failover taskInstance |
+| master.registry-disconnect-strategy.strategy | stop | Used when the master disconnect from registry, default value: stop. Optional values include stop, waiting |
+| master.registry-disconnect-strategy.max-waiting-time | 100s | Used when the master disconnect from registry, and the disconnect strategy is waiting, this config means the master will waiting to reconnect to registry in given times, and after the waiting times, if the master still cannot connect to registry, will stop itself, if the value is 0s, the Master will wait infinitely |
+| master.worker-group-refresh-interval | 10s | The interval to refresh worker group from db to memory |
 
 ### Worker Server related configuration
 
 Location: `worker-server/conf/application.yaml`
 
-| Parameters | Default value | Description |
-|------------------------------------------------------|---------------|------------------------------------------------------------------------------|
-| worker.listen-port | 1234 | worker-service listen port |
-| worker.exec-threads | 100 | worker-service execute thread number, used to limit the number of task instances in parallel |
-| worker.heartbeat-interval | 10 | worker-service heartbeat interval, the unit is second |
-| worker.host-weight | 100 | worker host weight to dispatch tasks |
-| worker.max-cpu-load-avg | 1 | worker max cpuload avg, only higher than the system cpu load average, worker server can be dispatched tasks. default value 1: will use 100% cpu. |
-| worker.reserved-memory | 0.3 | worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, only the available memory is higher than 30%, worker server can receive task. |
-| worker.alert-listen-host | localhost | the alert listen host of worker |
-| worker.alert-listen-port | 50052 | the alert listen port of worker |
-| worker.registry-disconnect-strategy.strategy | stop | Used when the worker disconnect from registry, default value: stop. Optional values include stop, waiting |
-| worker.registry-disconnect-strategy.max-waiting-time | 100s | Used when the worker disconnect from registry, and the disconnect strategy is waiting, this config means the worker will waiting to reconnect to registry in given times, and after the waiting times, if the worker still cannot connect to registry, will stop itself, if the value is 0s, will wait infinitely |
-| worker.task-execute-threads-full-policy | REJECT | If REJECT, when the task waiting in the worker reaches exec-threads, it will reject the received task and the Master will redispatch it; If CONTINUE, it will put the task into the worker's execution queue and wait for a free thread to start execution |
-| worker.tenant-config.auto-create-tenant-enabled | true | tenant corresponds to the user of the system, which is used by the worker to submit the job. If system does not have this user, it will be automatically created after the parameter worker.tenant.auto.create is true. |
-| worker.tenant-config.distributed-tenant-enabled | false | When this parameter is true, auto-create-tenant-enabled has no effect and will not automatically create tenants |
-| worker.tenant-config.default-tenant-enabled | false | If set true, will use worker bootstrap user as the tenant to execute task when the tenant is `default`. |
+| Parameters | Default value | Description |
+|------------------------------------------------------------------------------|---------------|------------------------------------------------------------------------------|
+| worker.listen-port | 1234 | worker-service listen port |
+| worker.exec-threads | 100 | worker-service execute thread number, used to limit the number of task instances in parallel |
+| worker.heartbeat-interval | 10 | worker-service heartbeat interval, the unit is second |
+| worker.host-weight | 100 | worker host weight to dispatch tasks |
+| worker.server-load-protection.enabled | true | If set true, will open worker overload protection |
+| worker.server-load-protection.max-cpu-usage-percentage-thresholds | 0.7 | Worker max cpu usage, when the worker's cpu usage is smaller than this value, worker server can be dispatched tasks. |
+| worker.server-load-protection.max-jvm-memory-usage-percentage-thresholds | 0.7 | Worker max JVM memory usage, when the worker's jvm memory usage is smaller than this value, worker server can be dispatched tasks. |
+| worker.server-load-protection.max-system-memory-usage-percentage-thresholds | 0.7 | Worker max System memory usage, when the worker's system memory usage is smaller than this value, worker server can be dispatched tasks. |
+| worker.server-load-protection.max-disk-usage-percentage-thresholds | 0.7 | Worker max disk usage, when the worker's disk usage is smaller than this value, worker server can be dispatched tasks. |
+| worker.registry-disconnect-strategy.strategy | stop | Used when the worker disconnect from registry, default value: stop. Optional values include stop, waiting |
+| worker.registry-disconnect-strategy.max-waiting-time | 100s | Used when the worker disconnect from registry, and the disconnect strategy is waiting, this config means the worker will waiting to reconnect to registry in given times, and after the waiting times, if the worker still cannot connect to registry, will stop itself, if the value is 0s, will wait infinitely |
+| worker.task-execute-threads-full-policy | REJECT | If REJECT, when the task waiting in the worker reaches exec-threads, it will reject the received task and the Master will redispatch it; If CONTINUE, it will put the task into the worker's execution queue and wait for a free thread to start execution |
+| worker.tenant-config.auto-create-tenant-enabled | true | tenant corresponds to the user of the system, which is used by the worker to submit the job. If system does not have this user, it will be automatically created after the parameter worker.tenant.auto.create is true. |
+| worker.tenant-config.distributed-tenant-enabled | false | When this parameter is true, auto-create-tenant-enabled has no effect and will not automatically create tenants |
+| worker.tenant-config.default-tenant-enabled | false | If set true, will use worker bootstrap user as the tenant to execute task when the tenant is `default`. |
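Putting the corrected worker property names together, a minimal `worker-server/conf/application.yaml` fragment might look like the sketch below. The values are the documented defaults and only keys listed in the table above are used; the exact nesting is an assumption:

```yaml
# Hypothetical fragment of worker-server/conf/application.yaml.
worker:
  exec-threads: 100
  host-weight: 100
  server-load-protection:
    enabled: true
    # The worker stops being dispatched tasks once any of these fractions is exceeded.
    max-cpu-usage-percentage-thresholds: 0.7
    max-jvm-memory-usage-percentage-thresholds: 0.7
    max-system-memory-usage-percentage-thresholds: 0.7
    max-disk-usage-percentage-thresholds: 0.7
  # Reject new tasks (and let the Master redispatch) when exec-threads is saturated.
  task-execute-threads-full-policy: REJECT
```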
### Alert Server related configuration diff --git a/dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertHeartbeatTask.java b/dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertHeartbeatTask.java index 150b8dbe0cee5..794cb49e24853 100644 --- a/dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertHeartbeatTask.java +++ b/dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertHeartbeatTask.java @@ -23,6 +23,8 @@ import org.apache.dolphinscheduler.common.utils.JSONUtils; import org.apache.dolphinscheduler.common.utils.NetUtils; import org.apache.dolphinscheduler.common.utils.OSUtils; +import org.apache.dolphinscheduler.meter.metrics.MetricsProvider; +import org.apache.dolphinscheduler.meter.metrics.SystemMetrics; import org.apache.dolphinscheduler.registry.api.RegistryClient; import org.apache.dolphinscheduler.registry.api.enums.RegistryNodeType; @@ -37,14 +39,18 @@ public class AlertHeartbeatTask extends BaseHeartBeatTask private final AlertConfig alertConfig; private final Integer processId; private final RegistryClient registryClient; + + private final MetricsProvider metricsProvider; private final String heartBeatPath; private final long startupTime; public AlertHeartbeatTask(AlertConfig alertConfig, + MetricsProvider metricsProvider, RegistryClient registryClient) { super("AlertHeartbeatTask", alertConfig.getHeartbeatInterval().toMillis()); this.startupTime = System.currentTimeMillis(); this.alertConfig = alertConfig; + this.metricsProvider = metricsProvider; this.registryClient = registryClient; this.heartBeatPath = RegistryNodeType.ALERT_SERVER.getRegistryPath() + "/" + alertConfig.getAlertServerAddress(); @@ -53,13 +59,14 @@ public AlertHeartbeatTask(AlertConfig alertConfig, @Override public AlertServerHeartBeat getHeartBeat() { + SystemMetrics systemMetrics = metricsProvider.getSystemMetrics(); return AlertServerHeartBeat.builder() .processId(processId) .startupTime(startupTime) .reportTime(System.currentTimeMillis()) - .cpuUsage(OSUtils.cpuUsagePercentage()) - .memoryUsage(OSUtils.memoryUsagePercentage()) - .availablePhysicalMemorySize(OSUtils.availablePhysicalMemorySize()) + .cpuUsage(systemMetrics.getTotalCpuUsedPercentage()) + .memoryUsage(systemMetrics.getSystemMemoryUsedPercentage()) + .jvmMemoryUsage(systemMetrics.getJvmMemoryUsedPercentage()) .host(NetUtils.getHost()) .port(alertConfig.getPort()) .build(); diff --git a/dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertRegistryClient.java b/dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertRegistryClient.java index a93742e54b4e9..cdb5e5eca8943 100644 --- a/dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertRegistryClient.java +++ b/dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertRegistryClient.java @@ -18,6 +18,7 @@ package org.apache.dolphinscheduler.alert.registry; import org.apache.dolphinscheduler.alert.config.AlertConfig; +import org.apache.dolphinscheduler.meter.metrics.MetricsProvider; import org.apache.dolphinscheduler.registry.api.RegistryClient; import org.apache.dolphinscheduler.registry.api.enums.RegistryNodeType;
@@ -36,12 +37,15 @@ public class AlertRegistryClient implements AutoCloseable { @Autowired private AlertConfig alertConfig; + @Autowired + private MetricsProvider metricsProvider; + private AlertHeartbeatTask alertHeartbeatTask; public void start() { log.info("AlertRegistryClient starting..."); registryClient.getLock(RegistryNodeType.ALERT_LOCK.getRegistryPath()); - alertHeartbeatTask = new AlertHeartbeatTask(alertConfig, registryClient); + alertHeartbeatTask = new AlertHeartbeatTask(alertConfig, metricsProvider, registryClient); alertHeartbeatTask.start(); // start heartbeat task log.info("AlertRegistryClient started..."); diff --git a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/enums/ServerStatus.java b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/enums/ServerStatus.java index 16a0f0e34c8f7..1e4f49721a5e0 100644 --- a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/enums/ServerStatus.java +++ b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/enums/ServerStatus.java @@ -19,6 +19,7 @@ public enum ServerStatus { - NORMAL, ABNORMAL, BUSY + NORMAL, + BUSY } diff --git a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/AlertServerHeartBeat.java b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/AlertServerHeartBeat.java index 9273b77a1c78d..94fff3d733890 100644 --- a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/AlertServerHeartBeat.java +++ b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/AlertServerHeartBeat.java @@ -33,7 +33,7 @@ public class AlertServerHeartBeat implements HeartBeat { private long reportTime; private double cpuUsage; private double memoryUsage; - private double availablePhysicalMemorySize; + private double jvmMemoryUsage; private String host; private int port; diff --git a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/MasterHeartBeat.java b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/MasterHeartBeat.java index 6386f32ae6582..1fafb117de313 100644 --- a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/MasterHeartBeat.java +++ b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/MasterHeartBeat.java @@ -33,12 +33,11 @@ public class MasterHeartBeat implements HeartBeat { private long startupTime; private long reportTime; private double cpuUsage; + private double jvmMemoryUsage; private double memoryUsage; - private double availablePhysicalMemorySize; - private double reservedMemory; - private double diskAvailable; - private int processId; + private double diskUsage; private ServerStatus serverStatus; + private int processId; private String host; private int port; diff --git a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/WorkerHeartBeat.java b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/WorkerHeartBeat.java index 396f227f472c6..40c852944944c 100644 --- a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/WorkerHeartBeat.java +++ b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/WorkerHeartBeat.java @@ -33,11 +33,9 @@ public class WorkerHeartBeat implements HeartBeat { private long startupTime; private long reportTime; private double cpuUsage; + private double jvmMemoryUsage; private double 
memoryUsage; - private double loadAverage; - private double availablePhysicalMemorySize; - private double reservedMemory; - private double diskAvailable; + private double diskUsage; private ServerStatus serverStatus; private int processId; @@ -45,7 +43,6 @@ public class WorkerHeartBeat implements HeartBeat { private int port; private int workerHostWeight; // worker host weight - private int workerWaitingTaskCount; // worker waiting task count - private int workerExecThreadCount; // worker thread pool thread count + private int threadPoolUsage; // worker waiting task count } diff --git a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/utils/OSUtils.java b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/utils/OSUtils.java index 22563efe4650c..e7acc8b4d1ed3 100644 --- a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/utils/OSUtils.java +++ b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/utils/OSUtils.java @@ -21,23 +21,17 @@ import org.apache.dolphinscheduler.common.shell.ShellExecutor; import oshi.SystemInfo; -import oshi.hardware.CentralProcessor; -import oshi.hardware.GlobalMemory; import oshi.hardware.HardwareAbstractionLayer; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.SystemUtils; import java.io.BufferedReader; -import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.lang.management.ManagementFactory; -import java.lang.management.OperatingSystemMXBean; import java.lang.management.RuntimeMXBean; -import java.math.RoundingMode; -import java.text.DecimalFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -45,32 +39,18 @@ import java.util.StringTokenizer; import java.util.regex.Pattern; +import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; -/** - * os utils - */ +// todo: Split to WindowsOSUtils/LinuxOSUtils/MacOSOSUtils/K8sOSUtils... @Slf4j +@UtilityClass public class OSUtils { private static final SystemInfo SI = new SystemInfo(); public static final String TWO_DECIMAL = "0.00"; - /** - * return -1 when the function can not get hardware env info - * e.g {@link OSUtils#cpuUsagePercentage()} - */ - public static final double NEGATIVE_ONE = -1; - private static final HardwareAbstractionLayer hal = SI.getHardware(); - private static long[] prevTicks = new long[CentralProcessor.TickType.values().length]; - private static long prevTickTime = 0L; - private static volatile double cpuUsage = 0.0D; - private static final double TOTAL_MEMORY = hal.getMemory().getTotal() / 1024.0 / 1024 / 1024; - - private OSUtils() { - throw new UnsupportedOperationException("Construct OSUtils"); - } /** * Initialization regularization, solve the problem of pre-compilation performance, @@ -78,79 +58,12 @@ private OSUtils() { */ private static final Pattern PATTERN = Pattern.compile("\\s+"); - /** - * get disk usage - * Keep 2 decimal - * - * @return disk free size, unit: GB - */ - public static double diskAvailable() { - File file = new File("."); - long freeSpace = file.getFreeSpace(); // unallocated / free disk space in bytes. 
- - double diskAvailable = freeSpace / 1024.0 / 1024 / 1024; - - DecimalFormat df = new DecimalFormat(TWO_DECIMAL); - df.setRoundingMode(RoundingMode.HALF_UP); - return Double.parseDouble(df.format(diskAvailable)); + public static double getTotalSystemMemory() { + return hal.getMemory().getTotal(); } - /** - * get available physical or pod memory size - *

- * Keep 2 decimal - * - * @return Available physical or pod memory size, unit: G - */ - public static double availablePhysicalMemorySize() { - double availablePhysicalMemorySize; - - if (KubernetesUtils.isKubernetesMode()) { - long freeMemory = Runtime.getRuntime().freeMemory(); - availablePhysicalMemorySize = freeMemory / 1024.0 / 1024 / 1024; - } else { - GlobalMemory memory = hal.getMemory(); - availablePhysicalMemorySize = memory.getAvailable() / 1024.0 / 1024 / 1024; - } - DecimalFormat df = new DecimalFormat(TWO_DECIMAL); - df.setRoundingMode(RoundingMode.HALF_UP); - return Double.parseDouble(df.format(availablePhysicalMemorySize)); - } - - /** - * get cpu usage - * - * @return cpu usage - */ - public static double cpuUsagePercentage() { - CentralProcessor processor = hal.getProcessor(); - - // Check if > ~ 0.95 seconds since last tick count. - long now = System.currentTimeMillis(); - if (now - prevTickTime > 950) { - // Enough time has elapsed. - if (KubernetesUtils.isKubernetesMode()) { - OperatingSystemMXBean operatingSystemMXBean = ManagementFactory.getOperatingSystemMXBean(); - cpuUsage = operatingSystemMXBean.getSystemLoadAverage(); - } else { - cpuUsage = processor.getSystemCpuLoadBetweenTicks(prevTicks); - } - - prevTickTime = System.currentTimeMillis(); - prevTicks = processor.getSystemCpuLoadTicks(); - } - - if (Double.isNaN(cpuUsage)) { - return NEGATIVE_ONE; - } - - DecimalFormat df = new DecimalFormat(TWO_DECIMAL); - df.setRoundingMode(RoundingMode.HALF_UP); - return Double.parseDouble(df.format(cpuUsage)); - } - - public static double memoryUsagePercentage() { - return (TOTAL_MEMORY - availablePhysicalMemorySize()) / TOTAL_MEMORY; + public static double getSystemMemoryUsed() { + return hal.getMemory().getTotal() - hal.getMemory().getAvailable(); } public static List getUserList() { @@ -433,43 +346,11 @@ public static String exeShell(String[] command) throws IOException { return ShellExecutor.execCommand(command); } - /** - * get process id - * - * @return process id - */ public static int getProcessID() { RuntimeMXBean runtimeMXBean = ManagementFactory.getRuntimeMXBean(); return Integer.parseInt(runtimeMXBean.getName().split("@")[0]); } - /** - * Check memory and cpu usage is overload the given thredshod. - * - * @param maxCpuLoadAvgThreshold maxCpuLoadAvg - * @param reservedMemoryThreshold reservedMemory - * @return True, if the cpu or memory exceed the given thredshod. 
- */ - public static Boolean isOverload(double maxCpuLoadAvgThreshold, double reservedMemoryThreshold) { - // system load average - double freeCPUPercentage = 1 - cpuUsagePercentage(); - // system available physical memory - double freeMemoryPercentage = 1 - memoryUsagePercentage(); - if (freeCPUPercentage > maxCpuLoadAvgThreshold) { - log.warn("Current cpu load average {} is too high, max.cpuLoad.avg={}", freeCPUPercentage, - maxCpuLoadAvgThreshold); - return true; - } - - if (freeMemoryPercentage < reservedMemoryThreshold) { - log.warn( - "Current available memory percentage{} is too low, reserved.memory={}", freeMemoryPercentage, - reservedMemoryThreshold); - return true; - } - return false; - } - public static Boolean isWindows() { return System.getProperty("os.name").startsWith("Windows"); } diff --git a/dolphinscheduler-common/src/test/java/org/apache/dolphinscheduler/common/os/OSUtilsTest.java b/dolphinscheduler-common/src/test/java/org/apache/dolphinscheduler/common/utils/OSUtilsTest.java similarity index 66% rename from dolphinscheduler-common/src/test/java/org/apache/dolphinscheduler/common/os/OSUtilsTest.java rename to dolphinscheduler-common/src/test/java/org/apache/dolphinscheduler/common/utils/OSUtilsTest.java index 4c37e99dc1cd4..97aa93743e083 100644 --- a/dolphinscheduler-common/src/test/java/org/apache/dolphinscheduler/common/os/OSUtilsTest.java +++ b/dolphinscheduler-common/src/test/java/org/apache/dolphinscheduler/common/utils/OSUtilsTest.java @@ -14,48 +14,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.dolphinscheduler.common.os; -import org.apache.dolphinscheduler.common.utils.OSUtils; +package org.apache.dolphinscheduler.common.utils; import org.apache.commons.lang3.SystemUtils; import java.util.List; +import lombok.extern.slf4j.Slf4j; + import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -/** - * OSUtilsTest - */ +@Slf4j public class OSUtilsTest { - private static Logger logger = LoggerFactory.getLogger(OSUtilsTest.class); - - @Test - public void diskAvailable() { - double diskAvailable = OSUtils.diskAvailable(); - logger.info("diskAvailable : {}", diskAvailable); - Assertions.assertTrue(diskAvailable >= 0.0); - } - - @Test - public void cpuUsage() { - double cpuUsage = OSUtils.cpuUsagePercentage(); - logger.info("cpuUsage : {}", cpuUsage); - Assertions.assertTrue(cpuUsage >= 0.0); - } - - @Test - public void availablePhysicalMemorySize() { - double physicalMemorySize = OSUtils.availablePhysicalMemorySize(); - logger.info("physicalMemorySize : {}", physicalMemorySize); - Assertions.assertTrue(physicalMemorySize >= 0.0); - - } - @Test public void existTenantCodeInLinux() { if (SystemUtils.IS_OS_LINUX) { @@ -77,9 +50,23 @@ public void existOSTenandCode() { Assertions.assertFalse(userList.contains("xxxtt")); } else { Assertions.assertFalse(false, "system must be linux"); - } + } + @Test + void getTotalSystemMemory() throws InterruptedException { + double totalSystemMemory = OSUtils.getTotalSystemMemory(); + Assertions.assertTrue(totalSystemMemory > 0); + // Assert that the memory is not changed + Thread.sleep(1000L); + Assertions.assertEquals(totalSystemMemory, OSUtils.getTotalSystemMemory()); } + @Test + void getSystemMemoryUsed() { + double systemMemoryUsed = OSUtils.getSystemMemoryUsed(); + Assertions.assertTrue(systemMemoryUsed > 0); + double totalSystemMemory = OSUtils.getTotalSystemMemory(); + 
Assertions.assertTrue(systemMemoryUsed < totalSystemMemory); + } } diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterConfig.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterConfig.java index 553789d384d0a..b65b2273c6d1e 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterConfig.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterConfig.java @@ -91,8 +91,7 @@ public class MasterConfig implements Validator { * state wheel check interval, if this value is bigger, may increase the delay of task/processInstance. */ private Duration stateWheelInterval = Duration.ofMillis(5); - private double maxCpuLoadAvg = 1; - private double reservedMemory = 0.1; + private MasterServerLoadProtection serverLoadProtection = new MasterServerLoadProtection(); private Duration failoverInterval = Duration.ofMinutes(10); private boolean killApplicationWhenTaskFailover = true; private ConnectStrategyProperties registryDisconnectStrategy = new ConnectStrategyProperties(); @@ -143,12 +142,6 @@ public void validate(Object target, Errors errors) { if (masterConfig.getFailoverInterval().toMillis() <= 0) { errors.rejectValue("failover-interval", null, "should be a valid duration"); } - if (masterConfig.getMaxCpuLoadAvg() <= 0) { - masterConfig.setMaxCpuLoadAvg(100); - } - if (masterConfig.getReservedMemory() <= 0) { - masterConfig.setReservedMemory(100); - } if (masterConfig.getWorkerGroupRefreshInterval().getSeconds() < 10) { errors.rejectValue("worker-group-refresh-interval", null, "should >= 10s"); @@ -175,8 +168,7 @@ private void printConfig() { "\n task-commit-retry-times -> " + taskCommitRetryTimes + "\n task-commit-interval -> " + taskCommitInterval + "\n state-wheel-interval -> " + stateWheelInterval + - "\n max-cpu-load-avg -> " + maxCpuLoadAvg + - "\n reserved-memory -> " + reservedMemory + + "\n server-load-protection -> " + serverLoadProtection + "\n failover-interval -> " + failoverInterval + "\n kill-application-when-task-failover -> " + killApplicationWhenTaskFailover + "\n registry-disconnect-strategy -> " + registryDisconnectStrategy + diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterServerLoadProtection.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterServerLoadProtection.java new file mode 100644 index 0000000000000..dbfb32d49b143 --- /dev/null +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterServerLoadProtection.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.dolphinscheduler.server.master.config; + +import org.apache.dolphinscheduler.meter.metrics.SystemMetrics; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Data +@NoArgsConstructor +@AllArgsConstructor +public class MasterServerLoadProtection { + + private boolean enabled = true; + + private double maxCpuUsagePercentageThresholds = 0.7; + + private double maxJVMMemoryUsagePercentageThresholds = 0.7; + + private double maxSystemMemoryUsagePercentageThresholds = 0.7; + + private double maxDiskUsagePercentageThresholds = 0.7; + + public boolean isOverload(SystemMetrics systemMetrics) { + if (!enabled) { + return false; + } + if (systemMetrics.getTotalCpuUsedPercentage() > maxCpuUsagePercentageThresholds) { + log.info( + "Master OverLoad: the TotalCpuUsedPercentage: {} is over the MaxCpuUsagePercentageThresholds {}", + systemMetrics.getTotalCpuUsedPercentage(), maxCpuUsagePercentageThresholds); + return true; + } + if (systemMetrics.getJvmMemoryUsedPercentage() > maxJVMMemoryUsagePercentageThresholds) { + log.info( + "Master OverLoad: the JvmMemoryUsedPercentage: {} is over the MaxJVMMemoryUsagePercentageThresholds {}", + systemMetrics.getJvmMemoryUsedPercentage(), maxJVMMemoryUsagePercentageThresholds); + return true; + } + if (systemMetrics.getDiskUsedPercentage() > maxDiskUsagePercentageThresholds) { + log.info("Master OverLoad: the DiskUsedPercentage: {} is over the MaxDiskUsagePercentageThresholds {}", + systemMetrics.getDiskUsedPercentage(), maxDiskUsagePercentageThresholds); + return true; + } + if (systemMetrics.getSystemMemoryUsedPercentage() > maxSystemMemoryUsagePercentageThresholds) { + log.info( + "Master OverLoad: the SystemMemoryUsedPercentage: {} is over the MaxSystemMemoryUsagePercentageThresholds {}", + systemMetrics.getSystemMemoryUsedPercentage(), maxSystemMemoryUsagePercentageThresholds); + return true; + } + return false; + } + +} diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/dispatch/host/LowerWeightHostManager.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/dispatch/host/LowerWeightHostManager.java index 607b78abbcd0e..e14cd26a9f7b3 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/dispatch/host/LowerWeightHostManager.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/dispatch/host/LowerWeightHostManager.java @@ -130,28 +130,22 @@ private void syncWorkerHostWeight(Map> workerHostWeights } } - public Optional getHostWeight(String addr, String workerGroup, WorkerHeartBeat heartBeat) { + public Optional getHostWeight(String workerAddress, String workerGroup, WorkerHeartBeat heartBeat) { if (heartBeat == null) { - log.warn("worker {} in work group {} have not received the heartbeat", addr, workerGroup); - return Optional.empty(); - } - if (ServerStatus.ABNORMAL == heartBeat.getServerStatus()) { - log.warn("worker {} current cpu load average {} is too high or available memory {}G is too low", - addr, heartBeat.getLoadAverage(), heartBeat.getAvailablePhysicalMemorySize()); + log.warn("No heartbeat received from Worker {} in WorkerGroup {}", workerAddress, workerGroup); return Optional.empty(); } if (ServerStatus.BUSY == heartBeat.getServerStatus()) { - log.warn("worker {} is busy, current waiting task count {} is large than worker thread count {}", - addr,
heartBeat.getWorkerWaitingTaskCount(), heartBeat.getWorkerExecThreadCount()); + log.warn("Worker {} in WorkerGroup {} is busy, heartbeat is {}", workerAddress, workerGroup, heartBeat); return Optional.empty(); } return Optional.of( new HostWeight( - HostWorker.of(addr, heartBeat.getWorkerHostWeight(), workerGroup), + HostWorker.of(workerAddress, heartBeat.getWorkerHostWeight(), workerGroup), heartBeat.getCpuUsage(), heartBeat.getMemoryUsage(), - heartBeat.getLoadAverage(), - heartBeat.getWorkerWaitingTaskCount(), + heartBeat.getDiskUsage(), + heartBeat.getThreadPoolUsage(), heartBeat.getStartupTime())); } diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/dispatch/host/assign/HostWeight.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/dispatch/host/assign/HostWeight.java index 2cd65be1f17bb..1efc0a63657b5 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/dispatch/host/assign/HostWeight.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/dispatch/host/assign/HostWeight.java @@ -20,69 +20,44 @@ import org.apache.dolphinscheduler.extract.base.utils.Constants; import org.apache.dolphinscheduler.extract.base.utils.Host; -/** - * host weight - */ +import lombok.Data; + +@Data public class HostWeight { - private final int CPU_FACTOR = 10; + private final int THREAD_USAGE_FACTOR = 10; + + private final int CPU_USAGE_FACTOR = 20; - private final int MEMORY_FACTOR = 20; + private final int MEMORY_USAGE_FACTOR = 30; - private final int LOAD_AVERAGE_FACTOR = 70; + private final int DISK_USAGE_FACTOR = 40; - private final HostWorker hostWorker; + private final Host host; private final double weight; + // if the weight is small, then it will be chosen first private double currentWeight; - private final int waitingTaskCount; - - public HostWeight(HostWorker hostWorker, double cpu, double memory, double loadAverage, int waitingTaskCount, + public HostWeight(HostWorker hostWorker, + double cpuUsage, + double memoryUsage, + double diskUsage, + double threadPoolUsage, long startTime) { - this.hostWorker = hostWorker; - this.weight = calculateWeight(cpu, memory, loadAverage, startTime); + this.host = hostWorker; + this.weight = calculateWeight(cpuUsage, memoryUsage, diskUsage, threadPoolUsage, startTime); this.currentWeight = this.weight; - this.waitingTaskCount = waitingTaskCount; - } - - public double getWeight() { - return weight; - } - - public double getCurrentWeight() { - return currentWeight; - } - - public void setCurrentWeight(double currentWeight) { - this.currentWeight = currentWeight; - } - - public HostWorker getHostWorker() { - return hostWorker; - } - - public Host getHost() { - return (Host) hostWorker; - } - - public int getWaitingTaskCount() { - return waitingTaskCount; - } - - @Override - public String toString() { - return "HostWeight{" - + "hostWorker=" + hostWorker - + ", weight=" + weight - + ", currentWeight=" + currentWeight - + ", waitingTaskCount=" + waitingTaskCount - + '}'; } - private double calculateWeight(double cpu, double memory, double loadAverage, long startTime) { - double calculatedWeight = cpu * CPU_FACTOR + memory * MEMORY_FACTOR + loadAverage * LOAD_AVERAGE_FACTOR; + private double calculateWeight(double cpuUsage, + double memoryUsage, + double diskUsage, + double threadPoolUsage, + long startTime) { + double calculatedWeight = (1 - cpuUsage) * CPU_USAGE_FACTOR + (1 - memoryUsage) * MEMORY_USAGE_FACTOR + + (1
- diskUsage) * DISK_USAGE_FACTOR + (1 - threadPoolUsage) * THREAD_USAGE_FACTOR; long uptime = System.currentTimeMillis() - startTime; if (uptime > 0 && uptime < Constants.WARM_UP_TIME) { // If the warm-up is not over, add the weight diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/dispatch/host/assign/LowerWeightRoundRobin.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/dispatch/host/assign/LowerWeightRoundRobin.java index 8ddfff6da58cd..d03fd59adaefe 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/dispatch/host/assign/LowerWeightRoundRobin.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/dispatch/host/assign/LowerWeightRoundRobin.java @@ -18,14 +18,6 @@ package org.apache.dolphinscheduler.server.master.dispatch.host.assign; import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.stream.Collectors; - -import org.springframework.util.CollectionUtils; - -import com.google.common.collect.Lists; /** * lower weight round robin @@ -43,8 +35,7 @@ public HostWeight doSelect(Collection sources) { double totalWeight = 0; double lowWeight = 0; HostWeight lowerNode = null; - List weights = canAssignTaskHost(sources); - for (HostWeight hostWeight : weights) { + for (HostWeight hostWeight : sources) { totalWeight += hostWeight.getWeight(); hostWeight.setCurrentWeight(hostWeight.getCurrentWeight() + hostWeight.getWeight()); if (lowerNode == null || lowWeight > hostWeight.getCurrentWeight()) { @@ -58,24 +49,4 @@ public HostWeight doSelect(Collection sources) { return lowerNode; } - private List canAssignTaskHost(Collection sources) { - if (CollectionUtils.isEmpty(sources)) { - return Collections.emptyList(); - } - List zeroWaitingTask = - sources.stream().filter(h -> h.getWaitingTaskCount() == 0).collect(Collectors.toList()); - if (!zeroWaitingTask.isEmpty()) { - return zeroWaitingTask; - } - HostWeight hostWeight = sources.stream().min(Comparator.comparing(HostWeight::getWaitingTaskCount)).get(); - List waitingTask = Lists.newArrayList(hostWeight); - List equalWaitingTask = sources.stream() - .filter(h -> !h.getHost().equals(hostWeight.getHost()) - && h.getWaitingTaskCount() == hostWeight.getWaitingTaskCount()) - .collect(Collectors.toList()); - if (!equalWaitingTask.isEmpty()) { - waitingTask.addAll(equalWaitingTask); - } - return waitingTask; - } } diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterRegistryClient.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterRegistryClient.java index f2aaf417ed319..935d86eaba14e 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterRegistryClient.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterRegistryClient.java @@ -23,6 +23,7 @@ import org.apache.dolphinscheduler.common.thread.ThreadUtils; import org.apache.dolphinscheduler.common.utils.JSONUtils; import org.apache.dolphinscheduler.common.utils.NetUtils; +import org.apache.dolphinscheduler.meter.metrics.MetricsProvider; import org.apache.dolphinscheduler.registry.api.RegistryClient; import org.apache.dolphinscheduler.registry.api.RegistryException; import org.apache.dolphinscheduler.registry.api.enums.RegistryNodeType; @@ -54,6 +55,9 @@ public class 
MasterRegistryClient implements AutoCloseable { @Autowired private MasterConfig masterConfig; + @Autowired + private MetricsProvider metricsProvider; + @Autowired private MasterConnectStrategy masterConnectStrategy; @@ -61,7 +65,7 @@ public class MasterRegistryClient implements AutoCloseable { public void start() { try { - this.masterHeartBeatTask = new MasterHeartBeatTask(masterConfig, registryClient); + this.masterHeartBeatTask = new MasterHeartBeatTask(masterConfig, metricsProvider, registryClient); // master registry registry(); registryClient.addConnectionStateListener( diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterSlotManager.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterSlotManager.java index 5fb0c74f5acb8..d20c52d91ad43 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterSlotManager.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterSlotManager.java @@ -72,7 +72,7 @@ public class SlotChangeListener implements MasterInfoChangeListener { @Override public void notify(Map masterNodeInfo) { List serverList = masterNodeInfo.values().stream() - .filter(heartBeat -> !heartBeat.getServerStatus().equals(ServerStatus.ABNORMAL)) + .filter(heartBeat -> !heartBeat.getServerStatus().equals(ServerStatus.BUSY)) .map(this::convertHeartBeatToServer).collect(Collectors.toList()); syncMasterNodes(serverList); } diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/MasterSchedulerBootstrap.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/MasterSchedulerBootstrap.java index 4d84644cb08e7..2fddd9438474f 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/MasterSchedulerBootstrap.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/MasterSchedulerBootstrap.java @@ -21,11 +21,13 @@ import org.apache.dolphinscheduler.common.lifecycle.ServerLifeCycleManager; import org.apache.dolphinscheduler.common.thread.BaseDaemonThread; import org.apache.dolphinscheduler.common.thread.ThreadUtils; -import org.apache.dolphinscheduler.common.utils.OSUtils; import org.apache.dolphinscheduler.dao.entity.Command; import org.apache.dolphinscheduler.dao.entity.ProcessInstance; +import org.apache.dolphinscheduler.meter.metrics.MetricsProvider; +import org.apache.dolphinscheduler.meter.metrics.SystemMetrics; import org.apache.dolphinscheduler.server.master.cache.ProcessInstanceExecCacheManager; import org.apache.dolphinscheduler.server.master.config.MasterConfig; +import org.apache.dolphinscheduler.server.master.config.MasterServerLoadProtection; import org.apache.dolphinscheduler.server.master.event.WorkflowEvent; import org.apache.dolphinscheduler.server.master.event.WorkflowEventQueue; import org.apache.dolphinscheduler.server.master.event.WorkflowEventType; @@ -78,6 +80,9 @@ public class MasterSchedulerBootstrap extends BaseDaemonThread implements AutoCl @Autowired private MasterTaskExecutorBootstrap masterTaskExecutorBootstrap; + @Autowired + private MetricsProvider metricsProvider; + protected MasterSchedulerBootstrap() { super("MasterCommandLoopThread"); } @@ -102,11 +107,9 @@ public void close() throws Exception { log.info("MasterSchedulerBootstrap stopped..."); } - /** - * run of MasterSchedulerService - */ @Override 
public void run() { + MasterServerLoadProtection serverLoadProtection = masterConfig.getServerLoadProtection(); while (!ServerLifeCycleManager.isStopped()) { try { if (!ServerLifeCycleManager.isRunning()) { @@ -115,9 +118,8 @@ public void run() { Thread.sleep(Constants.SLEEP_TIME_MILLIS); } // todo: if the workflow event queue is much, we need to handle the back pressure - boolean isOverload = - OSUtils.isOverload(masterConfig.getMaxCpuLoadAvg(), masterConfig.getReservedMemory()); - if (isOverload) { + SystemMetrics systemMetrics = metricsProvider.getSystemMetrics(); + if (serverLoadProtection.isOverload(systemMetrics)) { log.warn("The current server is overload, cannot consumes commands."); MasterServerMetrics.incMasterOverload(); Thread.sleep(Constants.SLEEP_TIME_MILLIS); diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/task/MasterHeartBeatTask.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/task/MasterHeartBeatTask.java index f8a9b30e28759..c84f51bc0e588 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/task/MasterHeartBeatTask.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/task/MasterHeartBeatTask.java @@ -24,8 +24,11 @@ import org.apache.dolphinscheduler.common.utils.JSONUtils; import org.apache.dolphinscheduler.common.utils.NetUtils; import org.apache.dolphinscheduler.common.utils.OSUtils; +import org.apache.dolphinscheduler.meter.metrics.MetricsProvider; +import org.apache.dolphinscheduler.meter.metrics.SystemMetrics; import org.apache.dolphinscheduler.registry.api.RegistryClient; import org.apache.dolphinscheduler.server.master.config.MasterConfig; +import org.apache.dolphinscheduler.server.master.config.MasterServerLoadProtection; import lombok.NonNull; import lombok.extern.slf4j.Slf4j; @@ -35,6 +38,8 @@ public class MasterHeartBeatTask extends BaseHeartBeatTask { private final MasterConfig masterConfig; + private MetricsProvider metricsProvider; + private final RegistryClient registryClient; private final String heartBeatPath; @@ -42,9 +47,11 @@ public class MasterHeartBeatTask extends BaseHeartBeatTask { private final int processId; public MasterHeartBeatTask(@NonNull MasterConfig masterConfig, + @NonNull MetricsProvider metricsProvider, @NonNull RegistryClient registryClient) { super("MasterHeartBeatTask", masterConfig.getHeartbeatInterval().toMillis()); this.masterConfig = masterConfig; + this.metricsProvider = metricsProvider; this.registryClient = registryClient; this.heartBeatPath = masterConfig.getMasterRegistryPath(); this.processId = OSUtils.getProcessID(); @@ -52,16 +59,17 @@ public MasterHeartBeatTask(@NonNull MasterConfig masterConfig, @Override public MasterHeartBeat getHeartBeat() { + SystemMetrics systemMetrics = metricsProvider.getSystemMetrics(); + ServerStatus serverStatus = getServerStatus(systemMetrics, masterConfig.getServerLoadProtection()); return MasterHeartBeat.builder() .startupTime(ServerLifeCycleManager.getServerStartupTime()) .reportTime(System.currentTimeMillis()) - .cpuUsage(OSUtils.cpuUsagePercentage()) - .availablePhysicalMemorySize(OSUtils.availablePhysicalMemorySize()) - .reservedMemory(masterConfig.getReservedMemory()) - .memoryUsage(OSUtils.memoryUsagePercentage()) - .diskAvailable(OSUtils.diskAvailable()) + .cpuUsage(systemMetrics.getTotalCpuUsedPercentage()) + .jvmMemoryUsage(systemMetrics.getJvmMemoryUsedPercentage()) + 
.memoryUsage(systemMetrics.getSystemMemoryUsedPercentage()) + .diskUsage(systemMetrics.getDiskUsedPercentage()) .processId(processId) - .serverStatus(getServerStatus()) + .serverStatus(serverStatus) .host(NetUtils.getHost()) .port(masterConfig.getListenPort()) .build(); @@ -75,9 +83,8 @@ public void writeHeartBeat(MasterHeartBeat masterHeartBeat) { heartBeatPath, masterHeartBeatJson); } - private ServerStatus getServerStatus() { - return OSUtils.isOverload(masterConfig.getMaxCpuLoadAvg(), masterConfig.getReservedMemory()) - ? ServerStatus.ABNORMAL - : ServerStatus.NORMAL; + private ServerStatus getServerStatus(SystemMetrics systemMetrics, + MasterServerLoadProtection masterServerLoadProtection) { + return masterServerLoadProtection.isOverload(systemMetrics) ? ServerStatus.BUSY : ServerStatus.NORMAL; } } diff --git a/dolphinscheduler-master/src/main/resources/application.yaml b/dolphinscheduler-master/src/main/resources/application.yaml index cbf25079f3fca..b23fa0df2d2f4 100644 --- a/dolphinscheduler-master/src/main/resources/application.yaml +++ b/dolphinscheduler-master/src/main/resources/application.yaml @@ -119,10 +119,16 @@ master: # master commit task interval task-commit-interval: 1s state-wheel-interval: 5s - # master max cpuload avg percentage, only higher than the system cpu load average, master server can schedule. default value 1: will use 100% cpu - max-cpu-load-avg: 1 - # master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, only the available memory is higher than 30%, master server can schedule. - reserved-memory: 0.3 + server-load-protection: + enabled: true + # Master max cpu usage, when the master's cpu usage is smaller than this value, the master server can execute workflows. + max-cpu-usage-percentage-thresholds: 0.8 + # Master max JVM memory usage, when the master's jvm memory usage is smaller than this value, the master server can execute workflows. + max-jvm-memory-usage-percentage-thresholds: 0.8 + # Master max system memory usage, when the master's system memory usage is smaller than this value, the master server can execute workflows. + max-system-memory-usage-percentage-thresholds: 0.8 + # Master max disk usage, when the master's disk usage is smaller than this value, the master server can execute workflows.
+ max-disk-usage-percentage-thresholds: 0.8 # failover interval, the unit is minute failover-interval: 10m # kill yarn / k8s application when failover taskInstance, default true diff --git a/dolphinscheduler-master/src/test/java/org/apache/dolphinscheduler/server/master/registry/MasterSlotManagerTest.java b/dolphinscheduler-master/src/test/java/org/apache/dolphinscheduler/server/master/registry/MasterSlotManagerTest.java index 4ee75a0392802..38ece75e4638b 100644 --- a/dolphinscheduler-master/src/test/java/org/apache/dolphinscheduler/server/master/registry/MasterSlotManagerTest.java +++ b/dolphinscheduler-master/src/test/java/org/apache/dolphinscheduler/server/master/registry/MasterSlotManagerTest.java @@ -46,7 +46,7 @@ void testNormalMasterSlots() { // on normal Master side Mockito.when(masterConfig.getMasterAddress()).thenReturn("127.0.0.1:7777"); - sendHeartBeat(ServerStatus.ABNORMAL, ServerStatus.NORMAL); + sendHeartBeat(ServerStatus.BUSY, ServerStatus.NORMAL); Assertions.assertEquals(1, masterSlotManager.getMasterSize()); Assertions.assertEquals(0, masterSlotManager.getSlot()); @@ -60,7 +60,7 @@ void testOverloadMasterSlots() { // on abnormal Master side Mockito.when(masterConfig.getMasterAddress()).thenReturn("127.0.0.1:6666"); - sendHeartBeat(ServerStatus.ABNORMAL, ServerStatus.NORMAL); + sendHeartBeat(ServerStatus.BUSY, ServerStatus.NORMAL); Assertions.assertEquals(0, masterSlotManager.getMasterSize()); Assertions.assertEquals(0, masterSlotManager.getSlot()); diff --git a/dolphinscheduler-meter/pom.xml b/dolphinscheduler-meter/pom.xml index 9fbb578f8cbbe..af3771972113d 100644 --- a/dolphinscheduler-meter/pom.xml +++ b/dolphinscheduler-meter/pom.xml @@ -42,6 +42,12 @@ + + org.apache.dolphinscheduler + dolphinscheduler-common + dev-SNAPSHOT + + org.springframework.boot spring-boot-starter-actuator diff --git a/dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/MetricsProvider.java b/dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/MetricsProvider.java new file mode 100644 index 0000000000000..5371d4daa9ca5 --- /dev/null +++ b/dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/MetricsProvider.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.dolphinscheduler.meter.metrics; + +public interface MetricsProvider { + + SystemMetrics getSystemMetrics(); + +} diff --git a/dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/PrometheusMetricsProvider.java b/dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/PrometheusMetricsProvider.java new file mode 100644 index 0000000000000..8ae0636c9b71d --- /dev/null +++ b/dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/PrometheusMetricsProvider.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.dolphinscheduler.meter.metrics; + +import org.apache.dolphinscheduler.common.utils.OSUtils; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + +import io.micrometer.core.instrument.MeterRegistry; + +@Component +public class PrometheusMetricsProvider implements MetricsProvider { + + @Autowired + private MeterRegistry meterRegistry; + + private SystemMetrics systemMetrics; + + private long lastRefreshTime = 0; + + private static final long SYSTEM_METRICS_REFRESH_INTERVAL = 1_000L; + + @Override + public SystemMetrics getSystemMetrics() { + if (System.currentTimeMillis() - lastRefreshTime < SYSTEM_METRICS_REFRESH_INTERVAL) { + return systemMetrics; + } + + double systemCpuUsage = meterRegistry.get("system.cpu.usage").gauge().value(); + double processCpuUsage = meterRegistry.get("process.cpu.usage").gauge().value(); + + double jvmMemoryUsed = meterRegistry.get("jvm.memory.used").meter().measure().iterator().next().getValue(); + double jvmMemoryMax = meterRegistry.get("jvm.memory.max").meter().measure().iterator().next().getValue(); + + double totalSystemMemory = OSUtils.getTotalSystemMemory(); + double systemMemoryUsed = OSUtils.getSystemMemoryUsed(); + + systemMetrics = SystemMetrics.builder() + .systemCpuUsagePercentage(systemCpuUsage) + .processCpuUsagePercentage(processCpuUsage) + .totalCpuUsedPercentage(systemCpuUsage + processCpuUsage) + .jvmMemoryUsed(jvmMemoryUsed) + .jvmMemoryMax(jvmMemoryMax) + .jvmMemoryUsedPercentage(jvmMemoryUsed / jvmMemoryMax) + .systemMemoryUsed(systemMemoryUsed) + .systemMemoryMax(totalSystemMemory) + .systemMemoryUsedPercentage(systemMemoryUsed / totalSystemMemory) + .build(); + lastRefreshTime = System.currentTimeMillis(); + return systemMetrics; + } + +} diff --git a/dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/SystemMetrics.java b/dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/SystemMetrics.java new file mode 100644 index 0000000000000..dcffafb83dee0 --- /dev/null +++ 
b/dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/SystemMetrics.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.dolphinscheduler.meter.metrics; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class SystemMetrics { + + // CPU + private double systemCpuUsagePercentage; + private double processCpuUsagePercentage; + private double totalCpuUsedPercentage; + + // JVM-Memory + // todo: get pod memory usage + private double jvmMemoryUsed; + private double jvmMemoryMax; + private double jvmMemoryUsedPercentage; + + // System-Memory + // todo: get pod cpu usage + private double systemMemoryUsed; + private double systemMemoryMax; + private double systemMemoryUsedPercentage; + + // Disk + // todo: get pod disk usage + private double diskUsed; + private double diskTotal; + private double diskUsedPercentage; + +} diff --git a/dolphinscheduler-meter/src/main/resources/grafana-demo/docker-compose.yaml b/dolphinscheduler-meter/src/main/resources/grafana-demo/docker-compose.yaml index 185803ad4f471..9a7dcc4f0adfd 100644 --- a/dolphinscheduler-meter/src/main/resources/grafana-demo/docker-compose.yaml +++ b/dolphinscheduler-meter/src/main/resources/grafana-demo/docker-compose.yaml @@ -33,7 +33,7 @@ services: networks: [ test ] ports: # due to the DolphinScheduler frontend port is 3000, so we change the grafana default port to 3001. - - "3001:3000" + - "3000:3000" environment: GF_AUTH_ANONYMOUS_ENABLED: "true" volumes: diff --git a/dolphinscheduler-standalone-server/src/main/resources/application.yaml b/dolphinscheduler-standalone-server/src/main/resources/application.yaml index 363943587bb7d..c3b01801b4388 100644 --- a/dolphinscheduler-standalone-server/src/main/resources/application.yaml +++ b/dolphinscheduler-standalone-server/src/main/resources/application.yaml @@ -188,10 +188,16 @@ master: # master commit task interval task-commit-interval: 1s state-wheel-interval: 5s - # master max cpuload avg percentage, only higher than the system cpu load average, master server can schedule. default value 1: will use 100% cpu - max-cpu-load-avg: 1 - # master reserved memory, only lower than system available memory, master server can schedule. default value 0.1, only the available memory is higher than 10%, master server can schedule. - reserved-memory: 0.1 + server-load-protection: + enabled: true + # Master max cpu usage, when the master's cpu usage is smaller than this value, the master server can execute workflows.
+ max-cpu-usage-percentage-thresholds: 0.8 + # Master max JVM memory usage, when the master's jvm memory usage is smaller than this value, the master server can execute workflows. + max-jvm-memory-usage-percentage-thresholds: 0.8 + # Master max system memory usage, when the master's system memory usage is smaller than this value, the master server can execute workflows. + max-system-memory-usage-percentage-thresholds: 0.8 + # Master max disk usage, when the master's disk usage is smaller than this value, the master server can execute workflows. + max-disk-usage-percentage-thresholds: 0.8 # failover interval failover-interval: 10m # kill yarn/k8s application when failover taskInstance, default true @@ -207,10 +213,20 @@ worker: heartbeat-interval: 10s # worker host weight to dispatch tasks, default value 100 host-weight: 100 - # worker max cpuload avg, only higher than the system cpu load average, worker server can be dispatched tasks. default value 1: will use 100% cpu. - max-cpu-load-avg: 1 - # worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.1, only the available memory is higher than 10%, worker server can receive task. - reserved-memory: 0.1 + # tenant corresponds to the user of the system, which is used by the worker to submit the job. If the system does not have this user, it will be created automatically when the parameter worker.tenant.auto.create is true. + tenant-auto-create: true + # Scenes to be used for distributed users. For example, users created by FreeIpa are stored in LDAP. This parameter only applies to Linux. When this parameter is true, worker.tenant.auto.create has no effect and tenants will not be created automatically. + tenant-distributed-user: false + server-load-protection: + enabled: true + # Worker max cpu usage, when the worker's cpu usage is smaller than this value, the worker server can be dispatched tasks. + max-cpu-usage-percentage-thresholds: 0.8 + # Worker max JVM memory usage, when the worker's jvm memory usage is smaller than this value, the worker server can be dispatched tasks. + max-jvm-memory-usage-percentage-thresholds: 0.8 + # Worker max system memory usage, when the worker's system memory usage is smaller than this value, the worker server can be dispatched tasks. + max-system-memory-usage-percentage-thresholds: 0.8 + # Worker max disk usage, when the worker's disk usage is smaller than this value, the worker server can be dispatched tasks. + max-disk-usage-percentage-thresholds: 0.8 task-execute-threads-full-policy: REJECT tenant-config: # tenant corresponds to the user of the system, which is used by the worker to submit the job. If system does not have this user, it will be automatically created after the parameter worker.tenant.auto.create is true.
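If these defaults are too aggressive for a constrained environment, the protection can be tuned or disabled per server. A minimal sketch (values illustrative, not recommendations):

```yaml
worker:
  server-load-protection:
    # turn overload protection off entirely, e.g. on a small dev box
    enabled: false
master:
  server-load-protection:
    enabled: true
    # thresholds are fractions of total capacity, so 0.9 allows up to 90% cpu usage
    max-cpu-usage-percentage-thresholds: 0.9
```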
diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/WorkerServer.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/WorkerServer.java index e8ae5381fd1e5..5af0c2617ef69 100644 --- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/WorkerServer.java +++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/WorkerServer.java @@ -21,11 +21,14 @@ import org.apache.dolphinscheduler.common.constants.Constants; import org.apache.dolphinscheduler.common.lifecycle.ServerLifeCycleManager; import org.apache.dolphinscheduler.common.thread.ThreadUtils; +import org.apache.dolphinscheduler.meter.metrics.MetricsProvider; +import org.apache.dolphinscheduler.meter.metrics.SystemMetrics; import org.apache.dolphinscheduler.plugin.task.api.TaskExecutionContext; import org.apache.dolphinscheduler.plugin.task.api.TaskPluginManager; import org.apache.dolphinscheduler.plugin.task.api.utils.LogUtils; import org.apache.dolphinscheduler.plugin.task.api.utils.ProcessUtils; import org.apache.dolphinscheduler.server.worker.message.MessageRetryRunner; +import org.apache.dolphinscheduler.server.worker.metrics.WorkerServerMetrics; import org.apache.dolphinscheduler.server.worker.registry.WorkerRegistryClient; import org.apache.dolphinscheduler.server.worker.rpc.WorkerRpcServer; import org.apache.dolphinscheduler.server.worker.runner.WorkerTaskExecutor; @@ -63,6 +66,9 @@ public class WorkerServer implements IStoppable { @Autowired private MessageRetryRunner messageRetryRunner; + @Autowired + private MetricsProvider metricsProvider; + /** * worker server startup, not use web service * @@ -83,6 +89,19 @@ public void run() { this.messageRetryRunner.start(); + WorkerServerMetrics.registerWorkerCpuUsageGauge(() -> { + SystemMetrics systemMetrics = metricsProvider.getSystemMetrics(); + return systemMetrics.getTotalCpuUsedPercentage(); + }); + WorkerServerMetrics.registerWorkerMemoryAvailableGauge(() -> { + SystemMetrics systemMetrics = metricsProvider.getSystemMetrics(); + return (systemMetrics.getSystemMemoryMax() - systemMetrics.getSystemMemoryUsed()) / 1024.0 / 1024 / 1024; + }); + WorkerServerMetrics.registerWorkerMemoryUsageGauge(() -> { + SystemMetrics systemMetrics = metricsProvider.getSystemMetrics(); + return systemMetrics.getJvmMemoryUsedPercentage(); + }); + /* * registry hooks, which are called before the process exits */ diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerConfig.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerConfig.java index db85d50244eda..b54d8653baedf 100644 --- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerConfig.java +++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerConfig.java @@ -45,8 +45,7 @@ public class WorkerConfig implements Validator { private int execThreads = 10; private Duration heartbeatInterval = Duration.ofSeconds(10); private int hostWeight = 100; - private int maxCpuLoadAvg = -1; - private double reservedMemory = 0.1; + private WorkerServerLoadProtection serverLoadProtection = new WorkerServerLoadProtection(); private ConnectStrategyProperties registryDisconnectStrategy = new ConnectStrategyProperties(); /** @@ -73,9 +72,6 @@ public void validate(Object target, Errors errors) { if (workerConfig.getHeartbeatInterval().getSeconds() <= 0) { 
errors.rejectValue("heartbeat-interval", null, "shoule be a valid duration"); } - if (workerConfig.getMaxCpuLoadAvg() <= 0) { - workerConfig.setMaxCpuLoadAvg(Runtime.getRuntime().availableProcessors() * 2); - } if (StringUtils.isEmpty(workerConfig.getWorkerAddress())) { workerConfig.setWorkerAddress(NetUtils.getAddr(workerConfig.getListenPort())); } @@ -93,8 +89,7 @@ private void printConfig() { "\n heartbeat-interval -> " + heartbeatInterval + "\n host-weight -> " + hostWeight + "\n tenantConfig -> " + tenantConfig + - "\n max-cpu-load-avg -> " + maxCpuLoadAvg + - "\n reserved-memory -> " + reservedMemory + + "\n server-load-protection -> " + serverLoadProtection + "\n registry-disconnect-strategy -> " + registryDisconnectStrategy + "\n task-execute-threads-full-policy: " + taskExecuteThreadsFullPolicy + "\n address -> " + workerAddress + diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerServerLoadProtection.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerServerLoadProtection.java new file mode 100644 index 0000000000000..6e68a71bf524f --- /dev/null +++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerServerLoadProtection.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerServerLoadProtection.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerServerLoadProtection.java
new file mode 100644
index 0000000000000..6e68a71bf524f
--- /dev/null
+++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerServerLoadProtection.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.dolphinscheduler.server.worker.config;
+
+import org.apache.dolphinscheduler.meter.metrics.SystemMetrics;
+
+import lombok.AllArgsConstructor;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
+@Data
+@Slf4j
+@NoArgsConstructor
+@AllArgsConstructor
+public class WorkerServerLoadProtection {
+
+    private boolean enabled = true;
+
+    private double maxCpuUsagePercentageThresholds = 0.7;
+
+    private double maxJVMMemoryUsagePercentageThresholds = 0.7;
+
+    private double maxSystemMemoryUsagePercentageThresholds = 0.7;
+
+    private double maxDiskUsagePercentageThresholds = 0.7;
+
+    public boolean isOverload(SystemMetrics systemMetrics) {
+        if (!enabled) {
+            return false;
+        }
+        if (systemMetrics.getTotalCpuUsedPercentage() > maxCpuUsagePercentageThresholds) {
+            log.info(
+                    "Worker OverLoad: the TotalCpuUsedPercentage: {} is higher than the MaxCpuUsagePercentageThresholds: {}",
+                    systemMetrics.getTotalCpuUsedPercentage(), maxCpuUsagePercentageThresholds);
+            return true;
+        }
+        if (systemMetrics.getJvmMemoryUsedPercentage() > maxJVMMemoryUsagePercentageThresholds) {
+            log.info(
+                    "Worker OverLoad: the JvmMemoryUsedPercentage: {} is higher than the MaxJVMMemoryUsagePercentageThresholds: {}",
+                    systemMetrics.getJvmMemoryUsedPercentage(), maxJVMMemoryUsagePercentageThresholds);
+            return true;
+        }
+        if (systemMetrics.getDiskUsedPercentage() > maxDiskUsagePercentageThresholds) {
+            log.info(
+                    "Worker OverLoad: the DiskUsedPercentage: {} is higher than the MaxDiskUsagePercentageThresholds: {}",
+                    systemMetrics.getDiskUsedPercentage(), maxDiskUsagePercentageThresholds);
+            return true;
+        }
+        if (systemMetrics.getSystemMemoryUsedPercentage() > maxSystemMemoryUsagePercentageThresholds) {
+            log.info(
+                    "Worker OverLoad: the SystemMemoryUsedPercentage: {} is higher than the MaxSystemMemoryUsagePercentageThresholds: {}",
+                    systemMetrics.getSystemMemoryUsedPercentage(), maxSystemMemoryUsagePercentageThresholds);
+            return true;
+        }
+        return false;
+    }
+
+}
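Because WorkerServerLoadProtection is a plain bean with no registry or Spring dependencies, the overload check is easy to exercise in isolation. A hypothetical smoke test, assuming SystemMetrics exposes Lombok-style setters matching the fraction-valued getters used above (values in [0, 1]):

```java
import org.apache.dolphinscheduler.meter.metrics.SystemMetrics;

public class LoadProtectionSmokeTest {

    public static void main(String[] args) {
        WorkerServerLoadProtection protection = new WorkerServerLoadProtection();

        SystemMetrics metrics = new SystemMetrics();
        metrics.setTotalCpuUsedPercentage(0.65);     // assumed setter; below the 0.7 cpu threshold
        metrics.setJvmMemoryUsedPercentage(0.50);
        metrics.setSystemMemoryUsedPercentage(0.50);
        metrics.setDiskUsedPercentage(0.50);

        // Every reading is under its threshold, so the worker is not overloaded.
        System.out.println(protection.isOverload(metrics)); // expected: false

        metrics.setDiskUsedPercentage(0.90); // exceeds the 0.7 disk threshold
        System.out.println(protection.isOverload(metrics)); // expected: true
    }
}
```

Note the checks are evaluated in a fixed order and short-circuit on the first breach, so at most one overload log line is emitted per call.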
diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/registry/WorkerRegistryClient.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/registry/WorkerRegistryClient.java
index b383af752db0c..fb0a21ae3b445 100644
--- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/registry/WorkerRegistryClient.java
+++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/registry/WorkerRegistryClient.java
@@ -26,6 +26,7 @@
 import org.apache.dolphinscheduler.common.thread.ThreadUtils;
 import org.apache.dolphinscheduler.common.utils.JSONUtils;
 import org.apache.dolphinscheduler.extract.base.utils.Host;
+import org.apache.dolphinscheduler.meter.metrics.MetricsProvider;
 import org.apache.dolphinscheduler.registry.api.RegistryClient;
 import org.apache.dolphinscheduler.registry.api.RegistryException;
 import org.apache.dolphinscheduler.registry.api.enums.RegistryNodeType;
@@ -55,7 +56,7 @@ public class WorkerRegistryClient implements AutoCloseable {
     private WorkerConfig workerConfig;
 
     @Autowired
-    private WorkerTaskExecutorThreadPool workerManagerThread;
+    private WorkerTaskExecutorThreadPool workerTaskExecutorThreadPool;
 
     @Autowired
     private RegistryClient registryClient;
@@ -64,14 +65,18 @@ public class WorkerRegistryClient implements AutoCloseable {
     @Lazy
     private WorkerConnectStrategy workerConnectStrategy;
 
+    @Autowired
+    private MetricsProvider metricsProvider;
+
     private WorkerHeartBeatTask workerHeartBeatTask;
 
     @PostConstruct
     public void initWorkRegistry() {
         this.workerHeartBeatTask = new WorkerHeartBeatTask(
                 workerConfig,
+                metricsProvider,
                 registryClient,
-                () -> workerManagerThread.getWaitingTaskExecutorSize());
+                workerTaskExecutorThreadPool);
     }
 
     public void start() {
diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/WorkerTaskExecutorThreadPool.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/WorkerTaskExecutorThreadPool.java
index 28588c0dbfb4c..0e4bb98080a23 100644
--- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/WorkerTaskExecutorThreadPool.java
+++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/WorkerTaskExecutorThreadPool.java
@@ -18,7 +18,6 @@
 package org.apache.dolphinscheduler.server.worker.runner;
 
 import org.apache.dolphinscheduler.common.thread.ThreadUtils;
-import org.apache.dolphinscheduler.common.utils.OSUtils;
 import org.apache.dolphinscheduler.server.worker.config.TaskExecuteThreadsFullPolicy;
 import org.apache.dolphinscheduler.server.worker.config.WorkerConfig;
 import org.apache.dolphinscheduler.server.worker.metrics.WorkerServerMetrics;
@@ -42,12 +41,8 @@ public WorkerTaskExecutorThreadPool(WorkerConfig workerConfig) {
                 ThreadUtils.newDaemonFixedThreadExecutor("WorkerTaskExecutorThreadPool", workerConfig.getExecThreads());
         this.workerConfig = workerConfig;
 
-        WorkerServerMetrics.registerWorkerCpuUsageGauge(OSUtils::cpuUsagePercentage);
-        WorkerServerMetrics.registerWorkerMemoryAvailableGauge(OSUtils::availablePhysicalMemorySize);
-        WorkerServerMetrics.registerWorkerMemoryUsageGauge(OSUtils::memoryUsagePercentage);
-        WorkerServerMetrics.registerWorkerExecuteQueueSizeGauge(
-                () -> threadPoolExecutor.getQueue().size() - threadPoolExecutor.getActiveCount());
-        WorkerServerMetrics.registerWorkerActiveExecuteThreadGauge(threadPoolExecutor::getActiveCount);
+        WorkerServerMetrics.registerWorkerExecuteQueueSizeGauge(this::getWaitingTaskExecutorSize);
+        WorkerServerMetrics.registerWorkerActiveExecuteThreadGauge(this::getRunningTaskExecutorSize);
     }
 
     public boolean submitWorkerTaskExecutor(WorkerTaskExecutor workerTaskExecutor) {
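One plausible motivation for rebasing the queue-size gauge on the pool's own counters: in a fixed-size executor the queued and active tasks are disjoint, so the old queue.size() - activeCount expression under-counts waiting work and can even go negative. A self-contained demonstration of that flaw with a raw ThreadPoolExecutor (illustrative only; not DolphinScheduler code):

```java
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class QueueGaugeDemo {

    public static void main(String[] args) throws InterruptedException {
        // Fixed-size pool of 2 threads with an unbounded queue, like a daemon fixed executor.
        ThreadPoolExecutor pool = new ThreadPoolExecutor(
                2, 2, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<>());

        for (int i = 0; i < 2; i++) {
            pool.execute(QueueGaugeDemo::sleepHalfSecond); // both start at once, queue stays empty
        }
        Thread.sleep(100); // give both tasks time to become active

        // Old formula: 0 (queued) - 2 (active) = -2, a nonsensical "waiting" count.
        System.out.println(pool.getQueue().size() - pool.getActiveCount());
        pool.shutdown();
    }

    private static void sleepHalfSecond() {
        try {
            Thread.sleep(500);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}
```

Delegating to getWaitingTaskExecutorSize() and getRunningTaskExecutorSize() keeps the gauges consistent with the bookkeeping the pool already does for heartbeats.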
diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/task/WorkerHeartBeatTask.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/task/WorkerHeartBeatTask.java
index 1294163e1e983..0f2c52087849b 100644
--- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/task/WorkerHeartBeatTask.java
+++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/task/WorkerHeartBeatTask.java
@@ -24,10 +24,12 @@
 import org.apache.dolphinscheduler.common.utils.JSONUtils;
 import org.apache.dolphinscheduler.common.utils.NetUtils;
 import org.apache.dolphinscheduler.common.utils.OSUtils;
+import org.apache.dolphinscheduler.meter.metrics.MetricsProvider;
+import org.apache.dolphinscheduler.meter.metrics.SystemMetrics;
 import org.apache.dolphinscheduler.registry.api.RegistryClient;
 import org.apache.dolphinscheduler.server.worker.config.WorkerConfig;
-
-import java.util.function.Supplier;
+import org.apache.dolphinscheduler.server.worker.config.WorkerServerLoadProtection;
+import org.apache.dolphinscheduler.server.worker.runner.WorkerTaskExecutorThreadPool;
 
 import lombok.NonNull;
 import lombok.extern.slf4j.Slf4j;
@@ -38,43 +40,38 @@ public class WorkerHeartBeatTask extends BaseHeartBeatTask<WorkerHeartBeat> {
 
     private final WorkerConfig workerConfig;
     private final RegistryClient registryClient;
-    private final Supplier<Integer> workerWaitingTaskCount;
+    private final MetricsProvider metricsProvider;
+    private final WorkerTaskExecutorThreadPool workerTaskExecutorThreadPool;
 
     private final int processId;
 
     public WorkerHeartBeatTask(@NonNull WorkerConfig workerConfig,
+                               @NonNull MetricsProvider metricsProvider,
                                @NonNull RegistryClient registryClient,
-                               @NonNull Supplier<Integer> workerWaitingTaskCount) {
+                               @NonNull WorkerTaskExecutorThreadPool workerTaskExecutorThreadPool) {
         super("WorkerHeartBeatTask", workerConfig.getHeartbeatInterval().toMillis());
+        this.metricsProvider = metricsProvider;
         this.workerConfig = workerConfig;
         this.registryClient = registryClient;
-        this.workerWaitingTaskCount = workerWaitingTaskCount;
+        this.workerTaskExecutorThreadPool = workerTaskExecutorThreadPool;
         this.processId = OSUtils.getProcessID();
     }
 
     @Override
     public WorkerHeartBeat getHeartBeat() {
-        double cpuUsagePercentage = OSUtils.cpuUsagePercentage();
-        int maxCpuUsePercentage = workerConfig.getMaxCpuLoadAvg();
-        double reservedMemory = workerConfig.getReservedMemory();
-        double memoryUsagePercentage = OSUtils.memoryUsagePercentage();
-        int execThreads = workerConfig.getExecThreads();
-        ServerStatus serverStatus =
-                getServerStatus(cpuUsagePercentage, maxCpuUsePercentage, memoryUsagePercentage, reservedMemory,
-                        execThreads, this.workerWaitingTaskCount.get());
+        SystemMetrics systemMetrics = metricsProvider.getSystemMetrics();
+        ServerStatus serverStatus = getServerStatus(systemMetrics, workerConfig, workerTaskExecutorThreadPool);
 
         return WorkerHeartBeat.builder()
                 .startupTime(ServerLifeCycleManager.getServerStartupTime())
                 .reportTime(System.currentTimeMillis())
-                .cpuUsage(cpuUsagePercentage)
-                .availablePhysicalMemorySize(OSUtils.availablePhysicalMemorySize())
-                .memoryUsage(OSUtils.memoryUsagePercentage())
-                .reservedMemory(reservedMemory)
-                .diskAvailable(OSUtils.diskAvailable())
+                .cpuUsage(systemMetrics.getTotalCpuUsedPercentage())
+                .jvmMemoryUsage(systemMetrics.getJvmMemoryUsedPercentage())
+                .memoryUsage(systemMetrics.getSystemMemoryUsedPercentage())
                 .processId(processId)
                 .workerHostWeight(workerConfig.getHostWeight())
-                .workerWaitingTaskCount(this.workerWaitingTaskCount.get())
-                .workerExecThreadCount(workerConfig.getExecThreads())
+                .threadPoolUsage(workerTaskExecutorThreadPool.getRunningTaskExecutorSize()
+                        + workerTaskExecutorThreadPool.getWaitingTaskExecutorSize())
                 .serverStatus(serverStatus)
                 .host(NetUtils.getHost())
                 .port(workerConfig.getListenPort())
@@ -91,23 +88,13 @@ public void writeHeartBeat(WorkerHeartBeat workerHeartBeat) {
                 workerRegistryPath, workerHeartBeatJson);
     }
 
-    private ServerStatus getServerStatus(double cpuUsagePercentage,
-                                         double maxCpuUsePercentage,
-                                         double memoryUsagePercentage,
-                                         double reservedMemory,
-                                         int workerExecThreadCount,
-                                         int workerWaitingTaskCount) {
-        if (cpuUsagePercentage > maxCpuUsePercentage || (1 - memoryUsagePercentage) < reservedMemory) {
-            log.warn(
-                    "current cpu load average {} is higher than {} or available memory {} is lower than {}",
-                    cpuUsagePercentage, maxCpuUsePercentage, 1 - memoryUsagePercentage, reservedMemory);
-            return ServerStatus.ABNORMAL;
-        } else if (workerWaitingTaskCount > workerExecThreadCount) {
-            log.warn("current waiting task count {} is large than worker thread count {}, worker is busy",
-                    workerWaitingTaskCount, workerExecThreadCount);
+    private ServerStatus getServerStatus(SystemMetrics systemMetrics,
+                                         WorkerConfig workerConfig,
+                                         WorkerTaskExecutorThreadPool workerTaskExecutorThreadPool) {
+        if (workerTaskExecutorThreadPool.isOverload()) {
             return ServerStatus.BUSY;
-        } else {
-            return ServerStatus.NORMAL;
         }
+        WorkerServerLoadProtection serverLoadProtection = workerConfig.getServerLoadProtection();
+        return serverLoadProtection.isOverload(systemMetrics) ? ServerStatus.BUSY : ServerStatus.NORMAL;
     }
 }
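The status decision is now a two-stage check: the thread pool's own overload signal is consulted first, and only then does load protection arbitrate between BUSY and NORMAL. Note that the old ABNORMAL outcome disappears entirely; an overloaded worker now reports BUSY, so the master backs off dispatching instead of treating the node as failed. A condensed restatement of the logic as a pure function (a sketch; the mini enum mirrors only the two values this path can produce):

```java
public class ServerStatusDecisionSketch {

    enum ServerStatus { NORMAL, BUSY }

    // Thread-pool overload wins first; otherwise load protection decides.
    static ServerStatus decide(boolean threadPoolOverloaded, boolean loadProtectionOverloaded) {
        if (threadPoolOverloaded) {
            return ServerStatus.BUSY;
        }
        return loadProtectionOverloaded ? ServerStatus.BUSY : ServerStatus.NORMAL;
    }

    public static void main(String[] args) {
        System.out.println(decide(false, false)); // NORMAL: dispatchable
        System.out.println(decide(false, true));  // BUSY: load protection engaged
        System.out.println(decide(true, false));  // BUSY: execution threads saturated
    }
}
```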
diff --git a/dolphinscheduler-worker/src/main/resources/application.yaml b/dolphinscheduler-worker/src/main/resources/application.yaml
index 50a3f199172fa..91e8af8a4c3e4 100644
--- a/dolphinscheduler-worker/src/main/resources/application.yaml
+++ b/dolphinscheduler-worker/src/main/resources/application.yaml
@@ -47,10 +47,16 @@ worker:
   heartbeat-interval: 10s
   # worker host weight to dispatch tasks, default value 100
   host-weight: 100
-  # worker max cpuload avg, only higher than the system cpu load average, worker server can be dispatched tasks. default value 1: will use 100% cpu.
-  max-cpu-load-avg: 1
-  # worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, only the available memory is higher than 30%, worker server can receive task.
-  reserved-memory: 0.3
+  server-load-protection:
+    enabled: true
+    # Worker max cpu usage; when the worker's cpu usage is lower than this value, the worker server can be dispatched tasks.
+    max-cpu-usage-percentage-thresholds: 0.7
+    # Worker max JVM memory usage; when the worker's JVM memory usage is lower than this value, the worker server can be dispatched tasks.
+    max-jvm-memory-usage-percentage-thresholds: 0.7
+    # Worker max system memory usage; when the worker's system memory usage is lower than this value, the worker server can be dispatched tasks.
+    max-system-memory-usage-percentage-thresholds: 0.7
+    # Worker max disk usage; when the worker's disk usage is lower than this value, the worker server can be dispatched tasks.
+    max-disk-usage-percentage-thresholds: 0.7
   registry-disconnect-strategy:
     # The disconnect strategy: stop, waiting
     strategy: waiting
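Although the keys end in "percentage", the 0.7 defaults are compared directly against the fraction-valued SystemMetrics getters, so these thresholds are fractions in [0, 1]. Operators migrating from the removed reserved-memory knob ("keep at least X of memory free") can invert it into the new usage threshold; a tiny hypothetical helper showing the arithmetic:

```java
public class ReservedMemoryMigrationSketch {

    // Old semantics: refuse tasks once free memory drops below `reservedMemory`.
    // New semantics: refuse tasks once used memory rises above the threshold.
    // The two meet at: threshold = 1 - reservedMemory.
    static double toUsageThreshold(double reservedMemory) {
        return 1.0 - reservedMemory;
    }

    public static void main(String[] args) {
        // The old default reserved-memory: 0.3 maps to the new default of 0.7.
        System.out.println(toUsageThreshold(0.3)); // prints 0.7
    }
}
```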
diff --git a/dolphinscheduler-worker/src/test/java/org/apache/dolphinscheduler/server/worker/registry/WorkerRegistryClientTest.java b/dolphinscheduler-worker/src/test/java/org/apache/dolphinscheduler/server/worker/registry/WorkerRegistryClientTest.java
index e6043486f5699..9c3a12294fbd1 100644
--- a/dolphinscheduler-worker/src/test/java/org/apache/dolphinscheduler/server/worker/registry/WorkerRegistryClientTest.java
+++ b/dolphinscheduler-worker/src/test/java/org/apache/dolphinscheduler/server/worker/registry/WorkerRegistryClientTest.java
@@ -20,14 +20,15 @@
 import static org.mockito.BDDMockito.given;
 
 import org.apache.dolphinscheduler.common.utils.NetUtils;
+import org.apache.dolphinscheduler.meter.metrics.MetricsProvider;
+import org.apache.dolphinscheduler.meter.metrics.SystemMetrics;
 import org.apache.dolphinscheduler.registry.api.RegistryClient;
 import org.apache.dolphinscheduler.registry.api.enums.RegistryNodeType;
 import org.apache.dolphinscheduler.server.worker.config.WorkerConfig;
+import org.apache.dolphinscheduler.server.worker.config.WorkerServerLoadProtection;
 import org.apache.dolphinscheduler.server.worker.runner.WorkerTaskExecutorThreadPool;
 
 import java.time.Duration;
-import java.util.Set;
-import java.util.concurrent.ScheduledExecutorService;
 
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
@@ -36,10 +37,6 @@
 import org.mockito.Mock;
 import org.mockito.Mockito;
 import org.mockito.junit.jupiter.MockitoExtension;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.collect.Sets;
 
 /**
  * worker registry test
@@ -47,10 +44,6 @@
 @ExtendWith(MockitoExtension.class)
 public class WorkerRegistryClientTest {
 
-    private static final Logger log = LoggerFactory.getLogger(WorkerRegistryClientTest.class);
-
-    private static final String TEST_WORKER_GROUP = "test";
-
     @InjectMocks
     private WorkerRegistryClient workerRegistryClient;
 
@@ -61,10 +54,7 @@ public class WorkerRegistryClientTest {
     private WorkerConfig workerConfig;
 
     @Mock
-    private Set<String> workerGroups = Sets.newHashSet("127.0.0.1");
-
-    @Mock
-    private ScheduledExecutorService heartBeatExecutor;
+    private MetricsProvider metricsProvider;
 
     @Mock
     private WorkerTaskExecutorThreadPool workerManagerThread;
@@ -72,17 +62,13 @@ public class WorkerRegistryClientTest {
     @Mock
     private WorkerConnectStrategy workerConnectStrategy;
 
-    // private static final Set<String> workerGroups;
-
-    static {
-        // workerGroups = Sets.newHashSet(DEFAULT_WORKER_GROUP, TEST_WORKER_GROUP);
-    }
-
     @Test
    public void testStart() {
         given(workerConfig.getWorkerAddress()).willReturn(NetUtils.getAddr(1234));
         given(workerConfig.getHeartbeatInterval()).willReturn(Duration.ofSeconds(1));
+        given(workerConfig.getServerLoadProtection()).willReturn(new WorkerServerLoadProtection());
+        given(metricsProvider.getSystemMetrics()).willReturn(new SystemMetrics());
 
         given(registryClient.checkNodeExists(Mockito.anyString(), Mockito.any(RegistryNodeType.class)))
                 .willReturn(true);