Commit
support the affinity feature of k8s which defines the rule of assigning pods to nodes (#475)
xiaojingchen authored and weekface committed May 22, 2019
1 parent 57d9075 commit 8917fa2
Showing 11 changed files with 195 additions and 390 deletions.
2 changes: 1 addition & 1 deletion charts/tidb-cluster/templates/config/_pd-config.tpl
@@ -82,7 +82,7 @@ max-replicas = {{ .Values.pd.maxReplicas }}
# The placement priorities are implied by the order of label keys.
# For example, ["zone", "rack"] means that we should place replicas to
# different zones first, then to different racks if we don't have enough zones.
location-labels = ["zone", "rack", "host"]
location-labels = ["region", "zone", "rack", "host"]

[label-property]
# Do not assign region leaders to stores that have these tags.
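To sanity-check the new `region` level without installing anything, the chart can be rendered locally; this is a sketch assuming the Helm 2 syntax used elsewhere in this repository's docs, with `${releaseName}` and `${namespace}` defined as in the operation guide:

```shell
# Render the chart and confirm the generated PD config carries the new label hierarchy;
# the command should print the location-labels line added above.
$ helm template charts/tidb-cluster --name=${releaseName} --namespace=${namespace} | grep location-labels
```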
15 changes: 6 additions & 9 deletions charts/tidb-cluster/templates/tidb-cluster.yaml
@@ -33,11 +33,10 @@ spec:
{{- if .Values.pd.resources }}
{{ toYaml .Values.pd.resources | indent 4 }}
{{- end }}
{{- if .Values.pd.nodeSelector }}
affinity:
{{ toYaml .Values.pd.affinity | indent 6 }}
nodeSelector:
{{ toYaml .Values.pd.nodeSelector | indent 6 }}
{{- end }}
nodeSelectorRequired: {{ .Values.nodeSelectorRequired | default true }}
{{- if .Values.pd.tolerations }}
tolerations:
{{ toYaml .Values.pd.tolerations | indent 4 }}
@@ -56,11 +55,10 @@ spec:
{{- if .Values.tikv.resources }}
{{ toYaml .Values.tikv.resources | indent 4 }}
{{- end }}
{{- if .Values.tikv.nodeSelector }}
affinity:
{{ toYaml .Values.tikv.affinity | indent 6 }}
nodeSelector:
{{ toYaml .Values.tikv.nodeSelector | indent 6 }}
{{- end }}
nodeSelectorRequired: {{ .Values.nodeSelectorRequired | default true }}
{{- if .Values.tikv.tolerations }}
tolerations:
{{ toYaml .Values.tikv.tolerations | indent 4 }}
@@ -76,11 +74,10 @@ spec:
{{- if .Values.tidb.resources }}
{{ toYaml .Values.tidb.resources | indent 4 }}
{{- end }}
{{- if .Values.tidb.nodeSelector }}
affinity:
{{ toYaml .Values.tidb.affinity | indent 6 }}
nodeSelector:
{{ toYaml .Values.tidb.nodeSelector | indent 6 }}
{{- end }}
nodeSelectorRequired: {{ .Values.nodeSelectorRequired | default true }}
{{- if .Values.tidb.tolerations }}
tolerations:
{{ toYaml .Values.tidb.tolerations | indent 4 }}
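For orientation, here is a rough sketch of what the PD section of the rendered `TidbCluster` manifest could look like once `pd.affinity` and `pd.nodeSelector` are set; the release name and label values are hypothetical, and the point is only to show where `toYaml ... | indent 6` places the two maps:

```yaml
# Hypothetical rendered fragment of the TidbCluster resource produced by this template.
spec:
  pd:
    replicas: 3
    affinity:
      podAntiAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchLabels:
                app.kubernetes.io/instance: demo-cluster   # hypothetical release name
                app.kubernetes.io/component: pd
            topologyKey: kubernetes.io/hostname
    nodeSelector:
      kind: pd
```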
103 changes: 88 additions & 15 deletions charts/tidb-cluster/values.yaml
@@ -73,16 +73,72 @@ pd:
# cpu: 4000m
# memory: 4Gi
storage: 1Gi
# nodeSelector is used for scheduling pod,
# if nodeSelectorRequired is true, all the following labels must be matched

## affinity defines pd scheduling rules; it is empty by default.
## please read the affinity document before setting your scheduling rules:
## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
affinity: {}
## The following is a typical example of affinity settings:
## The podAntiAffinity setting in the example keeps PD pods from co-locating in the same topology domain as far as possible, to improve the disaster tolerance of PD on Kubernetes.
## The nodeAffinity setting in the example ensures that PD pods can only be scheduled to nodes with the label kind="pd".
# affinity:
# podAntiAffinity:
# preferredDuringSchedulingIgnoredDuringExecution:
# # this term takes effect when the nodes have the label named region
# - weight: 10
# podAffinityTerm:
# labelSelector:
# matchLabels:
# app.kubernetes.io/instance: <release name>
# app.kubernetes.io/component: "pd"
# topologyKey: "region"
# namespaces:
# - <helm namespace>
# # this term takes effect when the nodes have the label named zone
# - weight: 20
# podAffinityTerm:
# labelSelector:
# matchLabels:
# app.kubernetes.io/instance: <release name>
# app.kubernetes.io/component: "pd"
# topologyKey: "zone"
# namespaces:
# - <helm namespace>
# # this term takes effect when the nodes have the label named rack
# - weight: 40
# podAffinityTerm:
# labelSelector:
# matchLabels:
# app.kubernetes.io/instance: <release name>
# app.kubernetes.io/component: "pd"
# topologyKey: "rack"
# namespaces:
# - <helm namespace>
# # this term takes effect when the nodes have the label named kubernetes.io/hostname
# - weight: 80
# podAffinityTerm:
# labelSelector:
# matchLabels:
# app.kubernetes.io/instance: <release name>
# app.kubernetes.io/component: "pd"
# topologyKey: "kubernetes.io/hostname"
# namespaces:
# - <helm namespace>
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: "kind"
# operator: In
# values:
# - "pd"

## nodeSelector ensures pods are only assigned to nodes that have each of the indicated key-value pairs as labels
## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector
nodeSelector: {}
# kind: pd
# # zone is comma separated availability zone list
# zone: cn-bj1-01,cn-bj1-02
# # region is comma separated region list
# region: cn-bj1
# Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints.
# refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration

## Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints.
## refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration
tolerations: []
# - key: node-role
# operator: Equal
@@ -117,10 +173,18 @@ tikv:
# cpu: 12000m
# memory: 24Gi
storage: 10Gi

## affinity defines tikv scheduling rules; it is empty by default.
## please read the affinity document before setting your scheduling rules:
## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
affinity: {}

## nodeSelector ensures pods are only assigned to nodes that have each of the indicated key-value pairs as labels
## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector
nodeSelector: {}
# kind: tikv
# zone: cn-bj1-01,cn-bj1-02
# region: cn-bj1

## Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints.
## refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration
tolerations: []
# - key: node-role
# operator: Equal
@@ -196,10 +260,19 @@ tidb:
requests: {}
# cpu: 12000m
# memory: 12Gi


## affinity defines tidb scheduling rules; it is empty by default.
## please read the affinity document before setting your scheduling rules:
## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
affinity: {}

## nodeSelector ensures pods are only assigned to nodes that have each of the indicated key-value pairs as labels
## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector
nodeSelector: {}
# kind: tidb
# zone: cn-bj1-01,cn-bj1-02
# region: cn-bj1

## Tolerations are applied to pods, and allow pods to schedule onto nodes with matching taints.
## refer to https://kubernetes.io/docs/concepts/configuration/taint-and-toleration
tolerations: []
# - key: node-role
# operator: Equal
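As a usage sketch, a small override file could pin each component to dedicated nodes; the `kind` label key and its values are assumptions about how the nodes are labeled, not something the chart requires:

```yaml
# custom-values.yaml (hypothetical): require PD to run on kind=pd nodes via node affinity,
# and use a plain nodeSelector for TiKV and TiDB.
pd:
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: kind
            operator: In
            values:
            - pd
tikv:
  nodeSelector:
    kind: tikv
tidb:
  nodeSelector:
    kind: tidb
```

It would then be applied with something like `helm install charts/tidb-cluster --name=${releaseName} --namespace=${namespace} -f custom-values.yaml`, or `helm upgrade` for an existing release.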
63 changes: 56 additions & 7 deletions docs/operation-guide.md
@@ -11,21 +11,70 @@ $ namespace="tidb"

> **Note:** The rest of the document will use `values.yaml` to reference `charts/tidb-cluster/values.yaml`
## Configuration

TiDB Operator uses `values.yaml` as the TiDB cluster configuration file. It provides a default basic configuration that you can use directly for quick deployment, but if you have specific configuration requirements, or for a production deployment, you need to modify the variables in `values.yaml` manually.

* Resource setting

* CPU & Memory

The default deployment doesn't set CPU and memory requests or limits for any of the pods. These settings let the TiDB cluster run on a small Kubernetes cluster such as DinD or the default GKE cluster for testing, but for a production deployment you will likely want to adjust the CPU, memory and storage resources according to the [recommendations](https://pingcap.com/docs/dev/how-to/deploy/hardware-recommendations/#software-and-hardware-recommendations).
The resource limits should be equal to or larger than the resource requests; it is suggested to set limits equal to requests so the pods get the [`Guaranteed` QoS class](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/#create-a-pod-that-gets-assigned-a-qos-class-of-guaranteed).
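For illustration, here is a sketch of equal requests and limits for PD in `values.yaml`; the sizes are placeholders rather than recommendations, and the layout mirrors the existing `resources` block:

```yaml
pd:
  resources:
    limits:
      cpu: 4000m
      memory: 4Gi
    requests:
      cpu: 4000m
      memory: 4Gi
      storage: 1Gi   # PD also requests its volume size here, as in the default values.yaml
```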

* Storage

The variables `pd.storageClassName` and `tikv.storageClassName` in `values.yaml` set the `StorageClass` used by PD and TiKV; both default to `local-storage` with a minimal size.

If you don't want to use the default `StorageClass`, or your Kubernetes cluster does not support the `local-storage` class, execute the following command to find an available `StorageClass` and select the one you want to provide to the TiDB cluster.

```shell
$ kubectl get sc
```
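Assuming the command shows a class you want to use (the name `ebs-gp2` below is hypothetical), it can be selected at install time, for example:

```shell
$ helm install charts/tidb-cluster --name=${releaseName} --namespace=${namespace} \
    --set pd.storageClassName=ebs-gp2 --set tikv.storageClassName=ebs-gp2
```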

* Disaster Tolerance setting

TiDB is a distributed database. Disaster tolerance here means that when any physical node fails, not only does the TiDB service remain available, but the data also stays complete and available.

How do we guarantee disaster tolerance of the TiDB cluster on Kubernetes?

We address this mainly through the scheduling of both the service instances and the data.

* Disaster Tolerance of TiDB instance

TiDB Operator provides an extended scheduler to guarantee PD/TiKV/TiDB instance disaster tolerance at the host level. The TiDB cluster uses this extended scheduler as its default scheduler; see the `schedulerName` variable in `values.yaml`.

On the other topology levels (e.g. rack, zone, region), disaster tolerance is ensured with the `podAntiAffinity` term of `affinity`; refer to [pod affinity & anti-affinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#inter-pod-affinity-and-anti-affinity-beta-feature). `values.yaml` also provides a typical disaster tolerance setting example in the comments of `pd.affinity`.
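Once the affinity rules are in place, a quick way to check where the PD pods actually landed is to list them with the node column; the label selector below follows the chart's standard labels:

```shell
$ kubectl get po -n ${namespace} -l app.kubernetes.io/instance=${releaseName},app.kubernetes.io/component=pd -o wide
```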

* Disaster Tolerance of data

Disaster tolerance of data is guaranteed by the TiDB cluster itself. The only work the Operator needs to do is to collect topology info from specific labels of the Kubernetes nodes that the TiKV Pods run on; PD then schedules data replicas automatically according to that topology info.
Because the current TiDB Operator can only recognize a few specific labels, node topology info can only be set with the following labels:

* `region`: region where node is located
* `zone`: zone where node is located
* `rack`: rack where node is located
* `kubernetes.io/hostname`: hostname of the node

You need to attach the topology labels to the nodes of the Kubernetes cluster with the following command:
```shell
# Not all labels are required
$ kubectl label node <nodeName> region=<regionName> zone=<zoneName> rack=<rackName> kubernetes.io/hostname=<hostName>
```
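To confirm the labels were applied, the topology keys can be printed as extra columns; all four labels are optional, and missing ones simply show up empty:

```shell
$ kubectl get nodes -L region,zone,rack,kubernetes.io/hostname
```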

For other settings, the variables in `values.yaml` are self-explanatory with comments. You can modify them according to your needs before installing the charts.

## Deploy TiDB cluster

After TiDB Operator and Helm are deployed correctly, TiDB cluster can be deployed using following command:
After TiDB Operator and Helm are deployed correctly and the configuration is completed, the TiDB cluster can be deployed using the following command:

```shell
$ helm install charts/tidb-cluster --name=${releaseName} --namespace=${namespace}
$ kubectl get po -n ${namespace} -l app.kubernetes.io/instance=${releaseName}
```

The default deployment doesn't set CPU and memory requests or limits for any of the pods, and the storage used is `local-storage` with minimal size. These settings can make TiDB cluster run on a small Kubernetes cluster like DinD or the default GKE cluster for testing. But for production deployment, you would likely to adjust the cpu, memory and storage resources according to the [recommendations](https://github.com/pingcap/docs/blob/master/op-guide/recommendation.md).

The resource limits should be equal or bigger than the resource requests, it is suggested to set limit and request equal to get [`Guaranteed` QoS]( https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/#create-a-pod-that-gets-assigned-a-qos-class-of-guaranteed).

For other settings, the variables in `values.yaml` are self-explanatory with comments. You can modify them according to your need before installing the charts.

## Access TiDB cluster

By default the TiDB service is exposed using [`NodePort`](https://kubernetes.io/docs/concepts/services-networking/service/#nodeport). You can modify it to `ClusterIP`, which disables access from outside the cluster, or to [`LoadBalancer`](https://kubernetes.io/docs/concepts/services-networking/service/#loadbalancer) if the underlying Kubernetes supports this kind of service.
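As a quick access sketch with the default `NodePort` service, look up the port that maps to TiDB's port 4000 and connect with any MySQL client; this assumes a fresh deployment where the `root` user has an empty password:

```shell
# Find the TiDB service (its name ends in -tidb) and note the NodePort mapped to port 4000.
$ kubectl get svc -n ${namespace}
# Connect through any Kubernetes node's IP and that NodePort.
$ mysql -h <node-ip> -P <node-port> -u root
```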
47 changes: 23 additions & 24 deletions pkg/apis/pingcap.com/v1alpha1/types.go
@@ -15,10 +15,9 @@ package v1alpha1

import (
apps "k8s.io/api/apps/v1beta1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"

corev1 "k8s.io/api/core/v1"
)

const (
@@ -107,27 +106,27 @@ type TidbClusterStatus struct {
// PDSpec contains details of PD member
type PDSpec struct {
ContainerSpec
Replicas int32 `json:"replicas"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
NodeSelectorRequired bool `json:"nodeSelectorRequired,omitempty"`
StorageClassName string `json:"storageClassName,omitempty"`
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
Annotations map[string]string `json:"annotations,omitempty"`
Replicas int32 `json:"replicas"`
Affinity *corev1.Affinity `json:"affinity,omitempty"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
StorageClassName string `json:"storageClassName,omitempty"`
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
Annotations map[string]string `json:"annotations,omitempty"`
}

// TiDBSpec contains details of TiDB member
type TiDBSpec struct {
ContainerSpec
Replicas int32 `json:"replicas"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
NodeSelectorRequired bool `json:"nodeSelectorRequired,omitempty"`
StorageClassName string `json:"storageClassName,omitempty"`
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
Annotations map[string]string `json:"annotations,omitempty"`
BinlogEnabled bool `json:"binlogEnabled,omitempty"`
MaxFailoverCount int32 `json:"maxFailoverCount,omitempty"`
SeparateSlowLog bool `json:"separateSlowLog,omitempty"`
SlowLogTailer TiDBSlowLogTailerSpec `json:"slowLogTailer,omitempty"`
Replicas int32 `json:"replicas"`
Affinity *corev1.Affinity `json:"affinity,omitempty"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
StorageClassName string `json:"storageClassName,omitempty"`
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
Annotations map[string]string `json:"annotations,omitempty"`
BinlogEnabled bool `json:"binlogEnabled,omitempty"`
MaxFailoverCount int32 `json:"maxFailoverCount,omitempty"`
SeparateSlowLog bool `json:"separateSlowLog,omitempty"`
SlowLogTailer TiDBSlowLogTailerSpec `json:"slowLogTailer,omitempty"`
}

// TiDBSlowLogTailerSpec represents an optional log tailer sidecar with TiDB
@@ -138,12 +137,12 @@ type TiDBSlowLogTailerSpec struct {
// TiKVSpec contains details of TiKV member
type TiKVSpec struct {
ContainerSpec
Replicas int32 `json:"replicas"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
NodeSelectorRequired bool `json:"nodeSelectorRequired,omitempty"`
StorageClassName string `json:"storageClassName,omitempty"`
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
Annotations map[string]string `json:"annotations,omitempty"`
Replicas int32 `json:"replicas"`
Affinity *corev1.Affinity `json:"affinity,omitempty"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
StorageClassName string `json:"storageClassName,omitempty"`
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
Annotations map[string]string `json:"annotations,omitempty"`
}

// TiKVPromGatewaySpec runs as a sidecar with TiKVSpec
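To illustrate the new API field from the Go side, here is a hypothetical sketch that fills `PDSpec.Affinity` with a required node-affinity term; the import paths follow the ones used in this repository, and the `kind=pd` node label is an assumption, not something the types require:

```go
package example

import (
	corev1 "k8s.io/api/core/v1"

	"github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1"
)

// pdSpecWithAffinity returns a PDSpec that requires PD pods to run on nodes labeled kind=pd.
func pdSpecWithAffinity() v1alpha1.PDSpec {
	return v1alpha1.PDSpec{
		Replicas: 3,
		Affinity: &corev1.Affinity{
			NodeAffinity: &corev1.NodeAffinity{
				RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
					NodeSelectorTerms: []corev1.NodeSelectorTerm{{
						MatchExpressions: []corev1.NodeSelectorRequirement{{
							Key:      "kind",
							Operator: corev1.NodeSelectorOpIn,
							Values:   []string{"pd"},
						}},
					}},
				},
			},
		},
	}
}
```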
15 changes: 15 additions & 0 deletions pkg/apis/pingcap.com/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

8 changes: 2 additions & 6 deletions pkg/manager/member/pd_member_manager.go
@@ -477,12 +477,8 @@ func (pmm *pdMemberManager) getNewPDSetForTidbCluster(tc *v1alpha1.TidbCluster)
},
Spec: corev1.PodSpec{
SchedulerName: tc.Spec.SchedulerName,
Affinity: util.AffinityForNodeSelector(
ns,
tc.Spec.PD.NodeSelectorRequired,
label.New().Instance(instanceName).PD(),
tc.Spec.PD.NodeSelector,
),
Affinity: tc.Spec.PD.Affinity,
NodeSelector: tc.Spec.PD.NodeSelector,
Containers: []corev1.Container{
{
Name: v1alpha1.PDMemberType.String(),
8 changes: 2 additions & 6 deletions pkg/manager/member/tidb_member_manager.go
@@ -348,12 +348,8 @@ func (tmm *tidbMemberManager) getNewTiDBSetForTidbCluster(tc *v1alpha1.TidbClust
},
Spec: corev1.PodSpec{
SchedulerName: tc.Spec.SchedulerName,
Affinity: util.AffinityForNodeSelector(
ns,
tc.Spec.TiDB.NodeSelectorRequired,
label.New().Instance(instanceName).TiDB(),
tc.Spec.TiDB.NodeSelector,
),
Affinity: tc.Spec.TiDB.Affinity,
NodeSelector: tc.Spec.TiDB.NodeSelector,
Containers: containers,
RestartPolicy: corev1.RestartPolicyAlways,
Tolerations: tc.Spec.TiDB.Tolerations,