diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6dd306c..5d7f57e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## v0.16.0
+- Fixing an issue where Sailfish instances were missing logs
+- Adding the ability to use Spot Instances!
+
+Removing application/product-specific tolerations on MachineSets. By default, the Runner always schedules on the Sailfish Machines, while the Run Manager schedules on any worker. More information on how this works can be found in `docs/sailfish-machines.md`, and on how to implement Spot Machines in `docs/features/spot-machinesets.md`.
+
 ## v0.15.0
 
 Adding Overlays for Demo
diff --git a/docs/features/spot-machinesets.md b/docs/features/spot-machinesets.md
new file mode 100644
index 0000000..cf747c3
--- /dev/null
+++ b/docs/features/spot-machinesets.md
@@ -0,0 +1,19 @@
+# Spot MachineSets
+To save up to 80% of your compute cost, use VMs from the Spot Market. Discounts vary over time and are typically higher outside of office hours.
+Note that OpenShift licenses are not discounted and are charged per hour at the pay-as-you-go rate.
+## Downsides
+The public cloud provider reserves the right to reclaim your machine with 30 seconds' notice. This will cause all of your pods running on that Machine to be evicted.
+In Sailfish this is usually not a problem, as the message is put back on the queue for another pod to pick up; however, if your runners run for a long duration, this can result in a more significant delay.
+
+## How to enable
+In your MachineSet ArgoCD Application, simply add the parameter:
+
+```
+  helm:
+    parameters:
+      - name: enableSpotVM
+        value: 'true'
+```
+This ensures that the VM type you selected with the `vmSize` parameter is provisioned from the Spot Market.
+
+
diff --git a/docs/sailfish-machines.md b/docs/sailfish-machines.md
new file mode 100644
index 0000000..cb8a813
--- /dev/null
+++ b/docs/sailfish-machines.md
@@ -0,0 +1,62 @@
+# Sailfish Machines
+The basic configuration of Sailfish contains two components: a Run Manager and a Runner.
+The Run Manager is usually lightweight, as it only splits the job into tasks and submits them to a queue. To avoid waiting for a dedicated machine to spin up just to handle that, we recommend scheduling the Run Manager on your Worker MachineSets.
+However, if the Run Manager is heavy and scalable, we recommend adding a `nodeSelector` to schedule it on the Sailfish Machines, just like the Runners.
+
+
+## Taints and Tolerations
+Using the MachineSet Helm chart declared in `/k8s/cluster-config/machinesets`, you get three MachineSets, one in each zone.
+
+By default, all of these Sailfish Machines are tainted with:
+```
+- effect: NoSchedule
+  key: application
+  value: sailfish-hpc
+```
+So that your Runners can schedule here, they tolerate this taint by default, declaring the following under `/spec/jobTargetRef/template/spec`:
+```
+tolerations:
+  - effect: NoSchedule
+    key: application
+    value: sailfish-hpc
+```
+If you wish to schedule the Run Manager on the Sailfish Machines, you need a kustomize `add` operation with the same toleration, for example:
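+A minimal sketch of such a patch, assuming your Run Manager resource follows the same `/spec/jobTargetRef/template/spec` layout as the Runner (how you name the patch file and wire it into your kustomization is up to you):
+```
+# Sketch: append the Sailfish taint toleration to the Run Manager pod spec.
+- op: add
+  path: "/spec/jobTargetRef/template/spec/tolerations"
+  value:
+    - effect: NoSchedule
+      key: application
+      value: sailfish-hpc
+```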
+
+In addition to the toleration, you also need to point the workloads at these Nodes with a `nodeSelector`.
+
+## NodeSelector Label
+All Sailfish Machines have this label:
+```
+metadata:
+  labels:
+    sailfish/application: {{ .Values.application }}
+```
+By default, the Runners also implement this label in `spec/jobTargetRef/template/spec`:
+```
+nodeSelector:
+  sailfish/application: sailfish
+```
+Similarly, you need a kustomize `add` operation on the Run Manager to schedule it on the Sailfish Machines, for example:
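+Again as a sketch; the label value must match the `sailfish/application` label of the MachineSets you are targeting (shown here with the default `sailfish`):
+```
+# Sketch: pin the Run Manager pods to the Sailfish Machines.
+- op: add
+  path: "/spec/jobTargetRef/template/spec/nodeSelector"
+  value:
+    sailfish/application: sailfish
+```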
+
+If you add the `tolerations` without adding the `nodeSelector`, you risk running your workload on any Sailfish instance present in your cluster!
+
+## Other workloads
+In this documentation we've only covered scheduling the Run Manager on the Sailfish Machines, but the same applies to any additional workload that you wish to schedule on the Sailfish Machines.
\ No newline at end of file
diff --git a/k8s/cluster-config/machinesets/templates/zone-1/machineset.yaml b/k8s/cluster-config/machinesets/templates/zone-1/machineset.yaml
index 6b70f54..61bf3f2 100644
--- a/k8s/cluster-config/machinesets/templates/zone-1/machineset.yaml
+++ b/k8s/cluster-config/machinesets/templates/zone-1/machineset.yaml
@@ -29,6 +29,9 @@ spec:
         sailfish/application: {{ .Values.application }}
     providerSpec:
       value:
+        {{- if .Values.enableSpotVM }}
+        spotVMOptions: {}
+        {{- end }}
         osDisk:
           diskSettings: {}
           diskSizeGB: 128
@@ -61,6 +64,5 @@ spec:
       apiVersion: azureproviderconfig.openshift.io/v1beta1
       taints:
       - effect: NoSchedule
-        key: sailfish
-        value: {{ .Values.application }}
-
+        key: application
+        value: sailfish-hpc
diff --git a/k8s/cluster-config/machinesets/templates/zone-2/machineset.yaml b/k8s/cluster-config/machinesets/templates/zone-2/machineset.yaml
index 9fab5e3..d90f053 100644
--- a/k8s/cluster-config/machinesets/templates/zone-2/machineset.yaml
+++ b/k8s/cluster-config/machinesets/templates/zone-2/machineset.yaml
@@ -29,6 +29,9 @@ spec:
         sailfish/application: {{ .Values.application }}
     providerSpec:
       value:
+        {{- if .Values.enableSpotVM }}
+        spotVMOptions: {}
+        {{- end }}
         osDisk:
           diskSettings: {}
           diskSizeGB: 128
@@ -61,6 +64,6 @@ spec:
       apiVersion: azureproviderconfig.openshift.io/v1beta1
       taints:
       - effect: NoSchedule
-        key: sailfish
-        value: {{ .Values.application }}
+        key: application
+        value: sailfish-hpc
 
diff --git a/k8s/cluster-config/machinesets/templates/zone-3/machineset.yaml b/k8s/cluster-config/machinesets/templates/zone-3/machineset.yaml
index 5c10124..614f2bb 100644
--- a/k8s/cluster-config/machinesets/templates/zone-3/machineset.yaml
+++ b/k8s/cluster-config/machinesets/templates/zone-3/machineset.yaml
@@ -29,6 +29,9 @@ spec:
         sailfish/application: {{ .Values.application }}
     providerSpec:
       value:
+        {{- if .Values.enableSpotVM }}
+        spotVMOptions: {}
+        {{- end }}
         osDisk:
           diskSettings: {}
           diskSizeGB: 128
@@ -61,6 +64,5 @@ spec:
      apiVersion: azureproviderconfig.openshift.io/v1beta1
      taints:
      - effect: NoSchedule
-       key: sailfish
-       value: {{ .Values.application }}
-
+       key: application
+       value: sailfish-hpc
diff --git a/k8s/sailfish/base/foundation/runner-autoscaler.yaml b/k8s/sailfish/base/foundation/runner-autoscaler.yaml
index 79d36e9..52d5625 100644
--- a/k8s/sailfish/base/foundation/runner-autoscaler.yaml
+++ b/k8s/sailfish/base/foundation/runner-autoscaler.yaml
@@ -19,8 +19,8 @@ spec:
           sailfish/application: sailfish
         tolerations:
           - effect: NoSchedule
-            key: sailfish
-            value: sailfish
+            key: application
+            value: sailfish-hpc
         # Provide your own container that you'd like to run as a Job Manager
         containers:
           - name: task-runner
diff --git a/k8s/sailfish/overlays/demo/run-manager-job.yaml b/k8s/sailfish/overlays/demo/run-manager-job.yaml
index b3ed51e..84e3050 100644
--- a/k8s/sailfish/overlays/demo/run-manager-job.yaml
+++ b/k8s/sailfish/overlays/demo/run-manager-job.yaml
@@ -1,13 +1,6 @@
 - op: replace
   path: "/spec/jobTargetRef/template/spec"
   value:
-    # Make sure to select the node that belongs to your solution.
-    nodeSelector:
-      sailfish/application: onboarding
-    tolerations:
-      - effect: NoSchedule
-        key: sailfish
-        value: onboarding
     # Provide your own container that you'd like to run as a Run Manager (splits Job into Tasks)
     containers:
       - name: run-manager
diff --git a/k8s/sailfish/overlays/demo/runner-job.yaml b/k8s/sailfish/overlays/demo/runner-job.yaml
index 3464c23..fda8667 100644
--- a/k8s/sailfish/overlays/demo/runner-job.yaml
+++ b/k8s/sailfish/overlays/demo/runner-job.yaml
@@ -4,10 +4,6 @@
     # Make sure to select the node that belongs to your solution.
     nodeSelector:
       sailfish/application: onboarding
-    tolerations:
-      - effect: NoSchedule
-        key: sailfish
-        value: onboarding
     # Provide your own container that you'd like to run as a Runner (compute workload)
     containers:
       - name: task-runner
diff --git a/sailfish-example/argocd/apps/machines.yaml b/sailfish-example/argocd/apps/machines.yaml
index a187bf5..cd4b6ee 100644
--- a/sailfish-example/argocd/apps/machines.yaml
+++ b/sailfish-example/argocd/apps/machines.yaml
@@ -18,6 +18,8 @@ spec:
     targetRevision: 'main'
     helm:
       parameters:
+        - name: enableSpotVM
+          value: 'true'
        - name: maxMachinesPerZone
          value: '3'
        - name: clusterName