Skip to content

Commit

Permalink
feat: enable mofed upgrade flow by default (Mellanox#705)
Browse files Browse the repository at this point in the history
Enable the MOFED upgrade flow by default in Helm and OCP deployments.
  • Loading branch information
adrianchiris committed Dec 17, 2023
2 parents 10ec312 + d80be8b commit 3f67c24
Show file tree
Hide file tree
Showing 17 changed files with 79 additions and 20 deletions.
7 changes: 5 additions & 2 deletions bundle/manifests/mellanox.com_nicclusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,6 @@ spec:
description: NVIPAMSpec describes configuration options for nv-ipam
1. Image information for nv-ipam 2. Configuration for nv-ipam
properties:
config:
type: string
enableWebhook:
default: false
description: Enable deployment of the validation webhook
Expand Down Expand Up @@ -554,6 +552,11 @@ spec:
will be upgraded in parallel
minimum: 0
type: integer
safeLoad:
default: false
description: SafeLoad turns on safe driver loading (cordon
and drain the node before loading the driver)
type: boolean
waitForCompletion:
description: WaitForCompletionSpec describes the configuration
for waiting on job completions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ metadata:
},
"terminationGracePeriodSeconds": 300,
"upgradePolicy": {
"autoUpgrade": false,
"autoUpgrade": true,
"drain": {
"deleteEmptyDir": true,
"enable": true,
Expand Down Expand Up @@ -95,7 +95,7 @@ metadata:
provider: NVIDIA
repository: https://github.com/Mellanox/network-operator/
support: NVIDIA
name: nvidia-network-operator.v23.7.0
name: nvidia-network-operator.v24.1.0
namespace: placeholder
spec:
apiservicedefinitions: {}
Expand Down Expand Up @@ -536,6 +536,14 @@ spec:
env:
- name: ENABLE_WEBHOOKS
value: "true"
- name: STATE_MANIFEST_BASE_DIR
value: /manifests
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: OFED_INIT_CONTAINER_IMAGE
value: ghcr.io/mellanox/network-operator-init-container:v0.0.2
image: nvcr.io/nvidia/cloud-native/network-operator@sha256:7005fa24a1ae52d927e76d50d90fddf6b6c7b08885a2dad3c7e5e2c2ac21c834
imagePullPolicy: IfNotPresent
livenessProbe:
Expand Down Expand Up @@ -643,7 +651,7 @@ spec:
provider:
name: NVIDIA
url: https://github.com/Mellanox/network-operator/
version: 23.7.0
version: 24.1.0
webhookdefinitions:
- admissionReviewVersions:
- v1
Expand Down
2 changes: 1 addition & 1 deletion config/samples/mellanox.com_v1alpha1_nicclusterpolicy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ spec:
periodSeconds: 20
terminationGracePeriodSeconds: 300
upgradePolicy:
autoUpgrade: false
autoUpgrade: true
drain:
deleteEmptyDir: true
enable: true
Expand Down
6 changes: 3 additions & 3 deletions deployment/network-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -418,14 +418,14 @@ imagePullSecrets:
| `ofedDriver.livenessProbe.periodSeconds` | int | 30 | Mellanox OFED liveness probe interval |
| `ofedDriver.readinessProbe.initialDelaySeconds` | int | 10 | Mellanox OFED readiness probe initial delay |
| `ofedDriver.readinessProbe.periodSeconds` | int | 30 | Mellanox OFED readiness probe interval |
| `ofedDriver.upgradePolicy.autoUpgrade` | bool | `false` | global switch for automatic upgrade feature |
| `ofedDriver.upgradePolicy.autoUpgrade` | bool | `true` | global switch for automatic upgrade feature |
| `ofedDriver.upgradePolicy.maxParallelUpgrades` | int | 1 | how many nodes can be upgraded in parallel, 0 means no limit, all nodes will be upgraded in parallel |
| `ofedDriver.upgradePolicy.safeLoad` | bool | `false` | cordon and drain (if enabled) a node before loading the driver on it, requires `ofedDriver.initContainer` to be enabled and `ofedDriver.upgradePolicy.autoUpgrade` to be true |
| `ofedDriver.upgradePolicy.drain.enable` | bool | `true` | drain a node before the driver restart |
| `ofedDriver.upgradePolicy.drain.force` | bool | `false` | use force drain (check `kubectl drain` doc for details) |
| `ofedDriver.upgradePolicy.drain.force` | bool | `true` | use force drain (check `kubectl drain` doc for details) |
| `ofedDriver.upgradePolicy.drain.podSelector` | string | "" | drain only pods matching this selector |
| `ofedDriver.upgradePolicy.drain.timeoutSeconds` | int | 300 | timeout for drain operation |
| `ofedDriver.upgradePolicy.drain.deleteEmptyDir` | bool | `false` | continue even if there are pods using emptyDir |
| `ofedDriver.upgradePolicy.drain.deleteEmptyDir` | bool | `true` | continue even if there are pods using emptyDir |
| `ofedDriver.upgradePolicy.waitForCompletion.podSelector` | string | not set | specifies a label selector for the pods to wait for completion before starting the driver upgrade |
| `ofedDriver.upgradePolicy.waitForCompletion.timeoutSeconds` | int | not set | specifies the length of time in seconds to wait for the workload to finish before giving up; zero means infinite |

Expand Down
6 changes: 3 additions & 3 deletions deployment/network-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ ofedDriver:
upgradePolicy:
# global switch for automatic upgrade feature
# if set to false all other options are ignored
autoUpgrade: false
autoUpgrade: true
# how many nodes can be upgraded in parallel (default: 1)
# 0 means no limit, all nodes will be upgraded in parallel
maxParallelUpgrades: 1
Expand All @@ -207,11 +207,11 @@ ofedDriver:
# removing PODs from the node
drain:
enable: true
force: false
force: true
podSelector: ""
# It's recommended to set a timeout to avoid an infinite drain in case a non-fatal error keeps happening on retries
timeoutSeconds: 300
deleteEmptyDir: false
deleteEmptyDir: true
waitForCompletion:
# specifies a label selector for the pods to wait for completion
# podSelector: "app=myapp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ spec:
image: mofed
repository: nvcr.io/nvstaging/mellanox
version: 23.10-0.5.5.0
upgradePolicy:
autoUpgrade: true
drain:
deleteEmptyDir: true
enable: true
force: true
timeoutSeconds: 300
maxParallelUpgrades: 1
startupProbe:
initialDelaySeconds: 10
periodSeconds: 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ spec:
image: mofed
repository: nvcr.io/nvstaging/mellanox
version: 23.10-0.5.5.0
upgradePolicy:
autoUpgrade: true
drain:
deleteEmptyDir: true
enable: true
force: true
timeoutSeconds: 300
maxParallelUpgrades: 1
startupProbe:
initialDelaySeconds: 10
periodSeconds: 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ spec:
repository: nvcr.io/nvstaging/mellanox
version: 23.10-0.5.5.0
upgradePolicy:
autoUpgrade: false
autoUpgrade: true
drain:
deleteEmptyDir: true
enable: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ spec:
repository: nvcr.io/nvstaging/mellanox
version: 23.10-0.5.5.0
upgradePolicy:
autoUpgrade: false
autoUpgrade: true
drain:
deleteEmptyDir: true
enable: true
Expand Down
8 changes: 8 additions & 0 deletions example/crs/mellanox.com_v1alpha1_nicclusterpolicy_cr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ spec:
image: mofed
repository: nvcr.io/nvstaging/mellanox
version: 23.10-0.5.5.0
upgradePolicy:
autoUpgrade: true
drain:
deleteEmptyDir: true
enable: true
force: true
timeoutSeconds: 300
maxParallelUpgrades: 1
startupProbe:
initialDelaySeconds: 10
periodSeconds: 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ spec:
image: {{ .Mofed.Image }}
repository: {{ .Mofed.Repository }}
version: {{ .Mofed.Version }}
upgradePolicy:
autoUpgrade: true
drain:
deleteEmptyDir: true
enable: true
force: true
timeoutSeconds: 300
maxParallelUpgrades: 1
startupProbe:
initialDelaySeconds: 10
periodSeconds: 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ spec:
image: {{ .Mofed.Image }}
repository: {{ .Mofed.Repository }}
version: {{ .Mofed.Version }}
upgradePolicy:
autoUpgrade: true
drain:
deleteEmptyDir: true
enable: true
force: true
timeoutSeconds: 300
maxParallelUpgrades: 1
startupProbe:
initialDelaySeconds: 10
periodSeconds: 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ spec:
repository: {{ .Mofed.Repository }}
version: {{ .Mofed.Version }}
upgradePolicy:
autoUpgrade: false
autoUpgrade: true
drain:
deleteEmptyDir: true
enable: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ spec:
repository: {{ .Mofed.Repository }}
version: {{ .Mofed.Version }}
upgradePolicy:
autoUpgrade: false
autoUpgrade: true
drain:
deleteEmptyDir: true
enable: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ spec:
image: {{ .Mofed.Image }}
repository: {{ .Mofed.Repository }}
version: {{ .Mofed.Version }}
upgradePolicy:
autoUpgrade: true
drain:
deleteEmptyDir: true
enable: true
force: true
timeoutSeconds: 300
maxParallelUpgrades: 1
startupProbe:
initialDelaySeconds: 10
periodSeconds: 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ spec:
periodSeconds: 20
terminationGracePeriodSeconds: 300
upgradePolicy:
autoUpgrade: false
autoUpgrade: true
drain:
deleteEmptyDir: true
enable: true
Expand Down
6 changes: 3 additions & 3 deletions hack/templates/values/values.template
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ ofedDriver:
upgradePolicy:
# global switch for automatic upgrade feature
# if set to false all other options are ignored
autoUpgrade: false
autoUpgrade: true
# how many nodes can be upgraded in parallel (default: 1)
# 0 means no limit, all nodes will be upgraded in parallel
maxParallelUpgrades: 1
Expand All @@ -207,11 +207,11 @@ ofedDriver:
# removing PODs from the node
drain:
enable: true
force: false
force: true
podSelector: ""
# It's recommended to set a timeout to avoid an infinite drain in case a non-fatal error keeps happening on retries
timeoutSeconds: 300
deleteEmptyDir: false
deleteEmptyDir: true
waitForCompletion:
# specifies a label selector for the pods to wait for completion
# podSelector: "app=myapp"
Expand Down

0 comments on commit 3f67c24

Please sign in to comment.