diff --git a/README.md b/README.md index cc63d2499..4a4e4930b 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ This repository is where [SquareFactory](https://www.squarefactory.io) develops - GitOps-enabled with [ArgoCD](https://docs.clusterfactory.io/docs/main-concepts/gitops/argocd) and [Sealed Secrets](https://docs.clusterfactory.io/docs/main-concepts/gitops/sealed-secrets) - VM workloads with KubeVirt - Bare-metal workloads with [Slurm](https://docs.clusterfactory.io/docs/main-concepts/apps/slurm) -- Bare-metal provisioning with [xCAT](https://docs.clusterfactory.io/docs/main-concepts/apps/xcat) +- Bare-metal provisioning with [Grendel](https://docs.clusterfactory.io/docs/main-concepts/apps/grendel) - Supports CNI plugins with [Multus CNI](https://docs.clusterfactory.io/docs/main-concepts/core-network/multus-cni) - TLS/SSL certificates management with [cert-manager](https://docs.clusterfactory.io/docs/main-concepts/gitops/cert-manager) - Mirror of DeepSquare's software library (end user software) by using [CVMFS Stratum 1](https://docs.clusterfactory.io/docs/main-concepts/apps/cvmfs) diff --git a/argo.example/ldap/volumes/.gitkeep b/argo.example/ldap/volumes/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/argo.example/ldap/volumes/389ds-at-vie-nfs.yaml.example b/argo.example/ldap/volumes/389ds-at-vie-nfs.yaml.example deleted file mode 100644 index 3d95f06f8..000000000 --- a/argo.example/ldap/volumes/389ds-at-vie-nfs.yaml.example +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: 389ds-at-vie-nfs - namespace: ldap - labels: - app: 389ds - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 -provisioner: nfs.csi.k8s.io -parameters: - server: 172.24.0.3 - share: /srv/nfs/k8s/389ds - mountPermissions: '0775' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/region - values: - - ch-sion diff --git a/argo.example/ldap/volumes/openldap-nfs.yaml.example b/argo.example/ldap/volumes/openldap-nfs.yaml.example deleted file mode 100644 index 7234e41ac..000000000 --- a/argo.example/ldap/volumes/openldap-nfs.yaml.example +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: openldap-nfs - namespace: ldap - labels: - app: openldap - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 -provisioner: nfs.csi.k8s.io -parameters: - server: nfs.example.com - share: /srv/nfs/k8s/ldap - mountPermissions: '0775' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/region - values: - - ch-sion diff --git a/argo.example/local-path-storage/app-project.yaml b/argo.example/local-path-storage/app-project.yaml new file mode 100644 index 000000000..bf3300cdc --- /dev/null +++ b/argo.example/local-path-storage/app-project.yaml @@ -0,0 +1,25 @@ +apiVersion: argoproj.io/v1alpha1 +kind: AppProject +metadata: + name: local-path-storage + namespace: argocd + # Finalizer that ensures that project is not deleted until it is not referenced by any application + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + description: Local Path Storage + # Allow manifests to deploy from any Git repos + sourceRepos: + - '*' + # Only permit applications 
to deploy to the namespace in the same cluster + destinations: + - namespace: local-path-storage + server: https://kubernetes.default.svc + + namespaceResourceWhitelist: + - kind: '*' + group: '*' + + clusterResourceWhitelist: + - kind: '*' + group: '*' diff --git a/argo.example/default/apps/local-path-provisioner-app.yaml b/argo.example/local-path-storage/apps/local-path-storage-app.yaml similarity index 91% rename from argo.example/default/apps/local-path-provisioner-app.yaml rename to argo.example/local-path-storage/apps/local-path-storage-app.yaml index ced684821..85417ce47 100644 --- a/argo.example/default/apps/local-path-provisioner-app.yaml +++ b/argo.example/local-path-storage/apps/local-path-storage-app.yaml @@ -1,21 +1,21 @@ apiVersion: argoproj.io/v1alpha1 kind: Application metadata: - name: local-path-provisioner-app + name: local-path-storage-app namespace: argocd finalizers: - resources-finalizer.argocd.argoproj.io spec: - project: default + project: local-path-storage source: repoURL: https://github.com/rancher/local-path-provisioner.git targetRevision: v0.0.24 path: deploy - kustomize: + kustomize: {} destination: server: 'https://kubernetes.default.svc' - namespace: default + namespace: local-path-storage syncPolicy: automated: diff --git a/argo.example/local-path-storage/namespace.yaml b/argo.example/local-path-storage/namespace.yaml new file mode 100644 index 000000000..0539132fd --- /dev/null +++ b/argo.example/local-path-storage/namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: local-path-storage + labels: + app.kubernetes.io/name: local-path-storage diff --git a/argo.example/monitoring/volumes/grafana-nfs.yaml.example b/argo.example/monitoring/volumes/grafana-nfs.yaml.example deleted file mode 100644 index d2c0dad5e..000000000 --- a/argo.example/monitoring/volumes/grafana-nfs.yaml.example +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: grafana-nfs - namespace: monitoring - labels: - app: grafana - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 -provisioner: nfs.csi.k8s.io -parameters: - server: nfs.example.com - share: /srv/nfs/k8s/grafana - mountPermissions: '0775' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/region - values: - - ch-sion diff --git a/argo.example/monitoring/volumes/prometheus-nfs.yaml.example b/argo.example/monitoring/volumes/prometheus-nfs.yaml.example deleted file mode 100644 index 97b232763..000000000 --- a/argo.example/monitoring/volumes/prometheus-nfs.yaml.example +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: prometheus-nfs - namespace: monitoring - labels: - app: prometheus - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 -provisioner: nfs.csi.k8s.io -parameters: - server: nfs.example.com - share: /srv/nfs/k8s/prometheus - mountPermissions: '0775' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/region - values: - - ch-sion diff --git a/argo.example/provisioning/apps/xcat-app.yaml b/argo.example/provisioning/apps/grendel-app.yaml similarity index 95% rename from argo.example/provisioning/apps/xcat-app.yaml rename to 
argo.example/provisioning/apps/grendel-app.yaml index ce8083248..4a3d21a4f 100644 --- a/argo.example/provisioning/apps/xcat-app.yaml +++ b/argo.example/provisioning/apps/grendel-app.yaml @@ -1,7 +1,7 @@ apiVersion: argoproj.io/v1alpha1 kind: Application metadata: - name: xcat-app + name: grendel-app namespace: argocd finalizers: - resources-finalizer.argocd.argoproj.io @@ -12,9 +12,9 @@ spec: repoURL: git@github.com:squarefactory/ClusterFactory.git # You should use your branch too. targetRevision: HEAD - path: helm/xcat + path: helm/grendel helm: - releaseName: xcat + releaseName: grendel # Create a values file inside your fork and change the values. valueFiles: diff --git a/argo.example/provisioning/secrets/grendel-secret.yaml.example b/argo.example/provisioning/secrets/grendel-secret.yaml.example index 1b4423b5a..d6dfc1cf1 100644 --- a/argo.example/provisioning/secrets/grendel-secret.yaml.example +++ b/argo.example/provisioning/secrets/grendel-secret.yaml.example @@ -24,7 +24,7 @@ stringData: # By default, all loggers are on. You can turn off logging for specific # services here. # - loggers = {cli="on", tftp="off", dhcp="on", dns="off", provision="on", api="on", pxe="off"} + loggers = {cli="on", tftp="on", dhcp="on", dns="off", provision="on", api="on", pxe="on"} # # Admin ssh public keys. These are used in provision templates and elsewhere for @@ -104,7 +104,7 @@ stringData: # 10.17.40.0/23 and if so set the dhcp gateway/router to 10.17.41.254. # subnets = [ - {gateway = "10.10.2.1/24", dns="10.10.4.100", domainSearch="ch1.deepsquare.run", mtu="1500"} + {gateway = "192.168.0.1/24", dns="192.168.0.100", domainSearch="example.com", mtu="1500"} ] #------------------------------------------------------------------------------ diff --git a/argo.example/provisioning/secrets/postscript-privatekey-secret.yaml.local.example b/argo.example/provisioning/secrets/postscript-privatekey-secret.yaml.local.example new file mode 100644 index 000000000..451d3b952 --- /dev/null +++ b/argo.example/provisioning/secrets/postscript-privatekey-secret.yaml.local.example @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Secret +metadata: + name: postscript-privatekey-secret + namespace: provisioning +type: Opaque +stringData: + ## Create the key with: + ## ssh-keygen -f $(pwd)/key -C grendel + ## Encrypt with: + ## openssl enc -aes-256-cbc -a -salt -pbkdf2 -in key -out key.enc + key.enc: "" diff --git a/argo.example/provisioning/volumes/xcat-nfs.yaml.example b/argo.example/provisioning/volumes/xcat-nfs.yaml.example deleted file mode 100644 index 09960b602..000000000 --- a/argo.example/provisioning/volumes/xcat-nfs.yaml.example +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: xcat-nfs - namespace: provisioning - labels: - app: xcat - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 -provisioner: nfs.csi.k8s.io -parameters: - server: nfs.example.com - share: /srv/nfs/k8s/xcat - mountPermissions: '0775' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/region - values: - - ch-sion diff --git a/argo.example/slurm-cluster/volumes/example/controller-state-nfs.yaml.example b/argo.example/slurm-cluster/volumes/example/controller-state-nfs.yaml.example deleted file mode 100644 index fb7b1ad71..000000000 --- a/argo.example/slurm-cluster/volumes/example/controller-state-nfs.yaml.example +++ /dev/null @@ 
-1,26 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: controller-state-nfs - namespace: slurm-cluster - labels: - app: controller - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 -provisioner: nfs.csi.k8s.io -parameters: - server: nfs.example.com - share: /srv/nfs/k8s/slurm/controller - mountPermissions: '0775' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/region - values: - - ch-sion diff --git a/argo.example/slurm-cluster/volumes/example/ldap-users-pv.yaml.example b/argo.example/slurm-cluster/volumes/example/ldap-users-pv.yaml.example index 31794fc2e..3ca8ff1e1 100644 --- a/argo.example/slurm-cluster/volumes/example/ldap-users-pv.yaml.example +++ b/argo.example/slurm-cluster/volumes/example/ldap-users-pv.yaml.example @@ -2,7 +2,6 @@ apiVersion: v1 kind: PersistentVolume metadata: name: ldap-users-example-pv - namespace: slurm-cluster labels: app: slurm-login topology.kubernetes.io/region: ch-sion diff --git a/argo.example/harbor/volumes/harbor-nfs.yaml.example b/argo.example/volumes/dynamic-nfs.yaml similarity index 85% rename from argo.example/harbor/volumes/harbor-nfs.yaml.example rename to argo.example/volumes/dynamic-nfs.yaml index a572d5b71..0a0fd5c3f 100644 --- a/argo.example/harbor/volumes/harbor-nfs.yaml.example +++ b/argo.example/volumes/dynamic-nfs.yaml @@ -1,16 +1,14 @@ apiVersion: storage.k8s.io/v1 kind: StorageClass metadata: - name: harbor-nfs - namespace: ldap + name: dynamic-nfs labels: - app: harbor topology.kubernetes.io/region: ch-sion topology.kubernetes.io/zone: ch-sion-1 provisioner: nfs.csi.k8s.io parameters: server: nfs.example.com - share: /srv/nfs/k8s/harbor + share: /srv/nfs/dynamic mountPermissions: '0775' mountOptions: - hard diff --git a/core.example/coredns/overlays/prod/configmap.yaml b/core.example/coredns/overlays/prod/configmap.yaml index 240230f08..ccadc4012 100644 --- a/core.example/coredns/overlays/prod/configmap.yaml +++ b/core.example/coredns/overlays/prod/configmap.yaml @@ -44,7 +44,7 @@ data: example.com.db: | 192.168.0.1 gateway.example.com 192.168.0.2 mn1.example.com - 192.168.0.3 xcatmn.example.com + 192.168.0.3 grendel.example.com 192.168.0.5 cvmfs.example.com 192.168.0.6 nfs.example.com 192.168.0.7 mysql.example.com diff --git a/helm/cvmfs-server/values-example.yaml b/helm/cvmfs-server/values-example.yaml deleted file mode 100644 index 179b3972f..000000000 --- a/helm/cvmfs-server/values-example.yaml +++ /dev/null @@ -1,55 +0,0 @@ -replicas: 1 - -dnsPolicy: 'None' -dnsConfig: - nameservers: - - 10.96.0.10 - options: - - name: ndots - value: '0' - -nodeSelector: - kubernetes.io/hostname: k0s-bare-ch-basel-1 - topology.kubernetes.io/region: ch-basel - topology.kubernetes.io/zone: ch-basel-1 - -config: - replicas: - - name: repo.example.com - url: https://cvmfs-0.example.com/cvmfs/repo.example.com - keys: /etc/cvmfs/keys/example.com - options: '-o root' - -volumeMounts: - - name: cvmfs-keys - mountPath: /etc/cvmfs/keys/example.com - readOnly: true - -volumes: - - name: cvmfs-keys - secret: - secretName: cvmfs-keys-secret - defaultMode: 256 - -state: - storageClassName: 'local-path' - -storage: - storageClassName: 'local-path' - -ingress: - enabled: true - annotations: - cert-manager.io/cluster-issuer: production-cluster-issuer - traefik.ingress.kubernetes.io/router.entrypoints: websecure - traefik.ingress.kubernetes.io/router.tls: 
'true' - - ingressClass: 'traefik' - - hosts: - - cvmfs-1.example.com - - tls: - - secretName: cvmfs-1.example.com-secret - hosts: - - cvmfs-1.example.com diff --git a/helm/cvmfs-service/values-example.yaml b/helm/cvmfs-service/values-example.yaml deleted file mode 100644 index ceacf5f54..000000000 --- a/helm/cvmfs-service/values-example.yaml +++ /dev/null @@ -1,31 +0,0 @@ -dnsPolicy: 'None' -dnsConfig: - nameservers: - - 10.96.0.10 - searches: - - ch1.example.com - options: - - name: ndots - value: '0' - -repositories: - - name: software-sion-csquare-run - repository: software.sion.csquare.run - - name: unpacked-sion-csquare-run - repository: unpacked.sion.csquare.run - - name: stdenv-sion-csquare-run - repository: stdenv.sion.csquare.run - -configs: - default.local: - mountPath: default.local - contents: | - CVMFS_QUOTA_LIMIT=-1 - CVMFS_USE_GEOAPI=no - CVMFS_HTTP_PROXY="DIRECT" - CVMFS_KEYS_DIR="/etc/cvmfs/keys" - CVMFS_SERVER_URL="http://cvmfs.ch-sion-1.deepsquare.run/cvmfs/@fqrn@" - CVMFS_USER=root - -keys: - secretName: 'sion-csquare-run-keys-secret' diff --git a/helm/grendel/values.yaml b/helm/grendel/values.yaml index e028e6369..284cb50f4 100644 --- a/helm/grendel/values.yaml +++ b/helm/grendel/values.yaml @@ -31,7 +31,7 @@ config: # initrd: # - '/var/lib/grendel/ubuntu-focal-initramfs.img' # liveimg: '/var/lib/grendel/ubuntu-focal-squashfs.img' - # cmdline: console=ttyS0 console=tty0 root=live:http://grendel.internal/repo/ubuntu-focal-squashfs.img BOOTIF={{ $.nic.MAC }} ip=dhcp rd.live.overlay.readonly=1 rd.live.overlay.overlayfs=1 rd.neednet=1 + # cmdline: console=ttyS0 console=tty0 root=live:http://grendel.example.com/repo/ubuntu-focal-squashfs.img BOOTIF={{ $.nic.MAC }} ip=dhcp rd.live.overlay.readonly=1 rd.live.overlay.overlayfs=1 rd.neednet=1 ## This script is run on each node after booting ## @@ -42,7 +42,7 @@ config: set -ex # Fetch encrypted deploy key - curl --retry 5 -fsSL http://grendel.internal/repo/key.enc -o /key.enc + curl --retry 5 -fsSL http://grendel.example.com/repo/key.enc -o /key.enc chmod 600 /key.enc # Decrypt deploy key @@ -92,7 +92,7 @@ securityContext: runAsNonRoot: false runAsUser: 0 -# How long to wait for xcat to stop gracefully +# How long to wait for grendel to stop gracefully terminationGracePeriod: 10 ## Use an alternate scheduler. diff --git a/helm/openldap/Chart.yaml b/helm/openldap/Chart.yaml deleted file mode 100644 index 7ba568e73..000000000 --- a/helm/openldap/Chart.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v2 -name: openldap -description: xCAT Extreme Cloud Administration Toolkit, xCAT offers complete management for bare-metal based cluster. -type: application -home: https://www.openldap.org -sources: - - https://git.openldap.org/openldap/openldap -version: 0.1.0 -appVersion: '2.6.1' -maintainers: - - name: Marc Nguyen - email: marc@squarefactory.io - - name: Christophe Lillo - email: lillo@squarefactory.io diff --git a/helm/openldap/LICENSE b/helm/openldap/LICENSE deleted file mode 100644 index d01ec5b0e..000000000 --- a/helm/openldap/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. 
- - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2022 SquareFactory SA - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
\ No newline at end of file diff --git a/helm/openldap/README.md b/helm/openldap/README.md deleted file mode 100644 index e46a02e9a..000000000 --- a/helm/openldap/README.md +++ /dev/null @@ -1,76 +0,0 @@ -# openldap - -![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.6.1](https://img.shields.io/badge/AppVersion-2.6.1-informational?style=flat-square) - -xCAT Extreme Cloud Administration Toolkit, xCAT offers complete management for bare-metal based cluster. - -**Homepage:** - -## Maintainers - -| Name | Email | Url | -| ---- | ------ | --- | -| Marc Nguyen | | | -| Christophe Lillo | | | - -## Source Code - -* - -## Values - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| annotations | object | `{}` | | -| env.LDAP_ADD_SCHEMAS | string | `"yes"` | | -| env.LDAP_ALLOW_ANON_BINDING | string | `"yes"` | | -| env.LDAP_CONFIG_ADMIN_ENABLED | string | `"no"` | | -| env.LDAP_CUSTOM_LDIF_DIR | string | `"/ldifs"` | | -| env.LDAP_CUSTOM_SCHEMA_FILE | string | `"/schema/custom.ldif"` | | -| env.LDAP_EXTRA_SCHEMAS | string | `"cosine,inetorgperson,nis"` | | -| env.LDAP_GROUP | string | `"readers"` | | -| env.LDAP_LOGLEVEL | string | `"256"` | | -| env.LDAP_ROOT | string | `"dc=example,dc=org"` | | -| env.LDAP_SKIP_DEFAULT_TREE | string | `"no"` | | -| env.LDAP_ULIMIT_NOFILES | string | `"1024"` | | -| env.LDAP_USER_DC | string | `"users"` | | -| envSecretName | string | `""` | | -| image | string | `"docker.io/bitnami/openldap:2.6.3"` | | -| imagePullPolicy | string | `"IfNotPresent"` | | -| imagePullSecrets | object | `{}` | | -| initContainers | list | `[]` | | -| labels | object | `{}` | | -| ldapPort | int | `1389` | | -| ldapsPort | int | `1636` | | -| livenessProbe.failureThreshold | int | `5` | | -| livenessProbe.initialDelaySeconds | int | `60` | | -| livenessProbe.periodSeconds | int | `10` | | -| livenessProbe.successThreshold | int | `1` | | -| livenessProbe.timeoutSeconds | int | `10` | | -| nodeAffinity | object | `{}` | | -| nodeSelector | object | `{}` | | -| persistence.accessModes[0] | string | `"ReadWriteOnce"` | | -| persistence.selectorLabels | object | `{}` | | -| persistence.size | string | `"5Gi"` | | -| persistence.storageClassName | string | `""` | | -| readinessProbe.failureThreshold | int | `5` | | -| readinessProbe.periodSeconds | int | `10` | | -| readinessProbe.successThreshold | int | `1` | | -| readinessProbe.timeoutSeconds | int | `10` | | -| replicas | int | `1` | | -| resources.limits.memory | string | `"256Mi"` | | -| resources.requests.cpu | string | `"125m"` | | -| resources.requests.memory | string | `"256Mi"` | | -| schedulerName | string | `""` | | -| service.enabled | bool | `true` | | -| service.type | string | `"ClusterIP"` | | -| terminationGracePeriod | int | `10` | | -| tls.enabled | bool | `false` | | -| tls.secretName | string | `""` | | -| tolerations | list | `[]` | | -| updateStrategy.type | string | `"RollingUpdate"` | | -| volumeMounts | list | `[]` | | -| volumes | list | `[]` | | - ----------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) diff --git a/helm/openldap/templates/_helpers.tpl b/helm/openldap/templates/_helpers.tpl deleted file mode 100644 index 72eb513ef..000000000 --- a/helm/openldap/templates/_helpers.tpl +++ /dev/null @@ -1,7 +0,0 @@ 
-{{/* vim: set filetype=mustache: */}} -{{/* -Expand the name of the chart. -*/}} -{{- define "openldap.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} -{{- end -}} diff --git a/helm/openldap/templates/configmap.yaml b/helm/openldap/templates/configmap.yaml deleted file mode 100644 index c4109b982..000000000 --- a/helm/openldap/templates/configmap.yaml +++ /dev/null @@ -1,11 +0,0 @@ -kind: ConfigMap -apiVersion: v1 -metadata: - name: "{{ template "openldap.name" . }}-env" - namespace: '{{ .Release.Namespace }}' - labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' - app: "{{ template "openldap.name" . }}" -data: -{{ toYaml .Values.env | indent 2 }} diff --git a/helm/openldap/templates/service.yaml b/helm/openldap/templates/service.yaml deleted file mode 100644 index 8816352b8..000000000 --- a/helm/openldap/templates/service.yaml +++ /dev/null @@ -1,48 +0,0 @@ -{{- if .Values.service.enabled }} -{{- $serviceValues := .Values.service -}} -apiVersion: v1 -kind: Service -metadata: - name: "{{ template "openldap.name" $ }}" - namespace: '{{ $.Release.Namespace }}' - labels: - release: '{{ $.Release.Name }}' - chart: '{{ $.Chart.Name }}' - app: "{{ template "openldap.name" $ }}" -spec: - type: {{ $serviceValues.type }} - {{- if $serviceValues.clusterIP }} - clusterIP: {{ $serviceValues.clusterIP }} - {{- end }} - {{- if $serviceValues.externalIPs }} - externalIPs: - {{ toYaml $serviceValues.externalIPs | indent 4 }} - {{- end }} - {{- if $serviceValues.loadBalancerIP }} - loadBalancerIP: {{ $serviceValues.loadBalancerIP }} - {{- end }} - {{- if $serviceValues.loadBalancerSourceRanges }} - loadBalancerSourceRanges: - {{- range $cidr := $serviceValues.loadBalancerSourceRanges }} - - {{ $cidr }} - {{- end }} - {{- end }} - {{- if ne $serviceValues.type "ClusterIP" }} - externalTrafficPolicy: {{ $serviceValues.externalTrafficPolicy }} - {{- end }} - ports: - - port: {{ .Values.ldapPort }} - targetPort: ldap - protocol: TCP - name: ldap - {{- if .Values.tls.enabled }} - - port: {{ .Values.ldapsPort }} - targetPort: ldaps - protocol: TCP - name: ldaps - {{- end }} - selector: - app: "{{ template "openldap.name" $ }}" - app.kubernetes.io/name: "{{ template "openldap.name" $ }}" - app.kubernetes.io/instance: "{{ template "openldap.name" $ }}" -{{- end }} diff --git a/helm/openldap/templates/statefulset.yaml b/helm/openldap/templates/statefulset.yaml deleted file mode 100644 index dfca07425..000000000 --- a/helm/openldap/templates/statefulset.yaml +++ /dev/null @@ -1,169 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: "{{ template "openldap.name" . }}" - namespace: '{{ .Release.Namespace }}' - labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' - app: "{{ template "openldap.name" . }}" -spec: - serviceName: "{{ template "openldap.name" . }}" - selector: - matchLabels: - app: "{{ template "openldap.name" . }}" - app.kubernetes.io/name: "{{ template "openldap.name" . }}" - app.kubernetes.io/instance: "{{ template "openldap.name" . }}" - replicas: {{ .Values.replicas }} - updateStrategy: -{{ toYaml .Values.updateStrategy | trim | indent 4 }} - template: - metadata: - name: "{{ template "openldap.name" . }}" - labels: - release: {{ .Release.Name | quote }} - chart: "{{ .Chart.Name }}" - app: "{{ template "openldap.name" . }}" - app.kubernetes.io/name: "{{ template "openldap.name" . }}" - app.kubernetes.io/instance: "{{ template "openldap.name" . 
}}" -{{- if .Values.labels }} -{{ toYaml .Values.labels | indent 8 }} -{{- end }} -{{- if .Values.annotations }} - annotations: -{{ toYaml .Values.annotations | indent 8 }} -{{- end }} - spec: - {{- if .Values.schedulerName }} - schedulerName: "{{ .Values.schedulerName }}" - {{- end }} - {{- with .Values.tolerations }} - tolerations: -{{ toYaml . | indent 8 }} - {{- end }} - {{- with .Values.nodeSelector }} - nodeSelector: -{{ toYaml . | indent 8 }} - {{- end }} - {{- with .Values.nodeAffinity }} - affinity: -{{ toYaml . | indent 8 }} - {{- end }} - terminationGracePeriodSeconds: {{ .Values.terminationGracePeriod }} - {{- if .Values.imagePullSecrets }} - imagePullSecrets: -{{ toYaml .Values.imagePullSecrets | indent 8 }} - {{- end }} - initContainers: - {{- if .Values.tls.enabled }} - - name: "init-perms" - image: busybox:1.34.1 - command: ["sh", "-c"] - args: - - |- - cp -RLp /in/* /out/; - find /out -type f -exec chmod 600 {} \;; - chown -R 1001:1001 /out/; - volumeMounts: - - name: openldap-certs - mountPath: /in - - name: openldap-certs-with-perms - mountPath: /out - {{- end }} - {{- if .Values.initContainers }} -{{ toYaml .Values.initContainers | indent 8 }} - {{- end }} - {{- if .Values.dnsPolicy }} - dnsPolicy: "{{ .Values.dnsPolicy }}" - {{- end }} - {{- if .Values.dnsConfig }} - dnsConfig: -{{ toYaml .Values.dnsConfig | indent 8 }} - {{- end }} - containers: - - name: "{{ template "openldap.name" . }}" - image: "{{ .Values.image }}" - imagePullPolicy: "{{ .Values.imagePullPolicy }}" - ports: - - name: ldap - containerPort: {{ .Values.ldapPort }} - {{- if .Values.tls.enabled }} - - name: ldaps - containerPort: {{ .Values.ldapsPort }} - {{- end }} - envFrom: - - configMapRef: - name: "{{ template "openldap.name" . }}-env" - {{- if .Values.envSecretName }} - - secretRef: - name: "{{ .Values.envSecretName }}" - {{- end }} - env: - - name: LDAP_PORT_NUMBER - value: "{{ .Values.ldapPort }}" - - name: LDAP_ENABLE_TLS - value: "{{ ternary "yes" "no" .Values.tls.enabled }}" - {{- if .Values.tls.enabled }} - - name: LDAP_LDAPS_PORT_NUMBER - value: "{{ .Values.ldapsPort }}" - - name: LDAP_TLS_CERT_FILE - value: "/opt/bitnami/openldap/certs/tls.crt" - - name: LDAP_TLS_KEY_FILE - value: "/opt/bitnami/openldap/certs/tls.key" - - name: LDAP_TLS_CA_FILE - value: "/opt/bitnami/openldap/certs/ca.crt" - {{- end }} - readinessProbe: - exec: - command: - - sh - - -c - - >- - ldapsearch - -H ldap://127.0.0.1:{{ .Values.ldapPort }} - -D cn=${LDAP_ADMIN_USERNAME:-"admin"},${LDAP_ROOT:-"dc=example,dc=org"} - -b ${LDAP_ROOT:-"dc=example,dc=org"} - -w ${LDAP_ADMIN_PASSWORD:-"adminpassword"} -{{ toYaml .Values.readinessProbe | indent 12 }} - livenessProbe: - tcpSocket: - port: ldap -{{ toYaml .Values.livenessProbe | indent 12 }} - volumeMounts: - - name: openldap-data - mountPath: /bitnami/openldap/ - {{- if .Values.tls.enabled }} - - name: openldap-certs-with-perms - mountPath: /opt/bitnami/openldap/certs/ - {{- end }} - {{- if .Values.volumeMounts }} -{{ toYaml .Values.volumeMounts | indent 12 }} - {{- end }} - resources: -{{ toYaml .Values.resources | indent 12 }} - volumes: - {{- if .Values.tls.enabled }} - - name: openldap-certs - secret: - secretName: "{{ .Values.tls.secretName }}" - defaultMode: 256 - - name: openldap-certs-with-perms - emptyDir: {} - {{- end }} - {{- if .Values.volumes }} -{{ toYaml .Values.volumes | indent 8 }} - {{- end }} - volumeClaimTemplates: - - metadata: - name: openldap-data - spec: - accessModes: {{ .Values.persistence.accessModes }} - storageClassName: "{{ 
.Values.persistence.storageClassName }}" - resources: - requests: - storage: {{ .Values.persistence.size }} - {{- with .Values.persistence.selectorLabels }} - selector: - matchLabels: -{{ toYaml . | indent 12 }} - {{- end }} diff --git a/helm/openldap/values-example.yaml b/helm/openldap/values-example.yaml deleted file mode 100644 index be467ee6f..000000000 --- a/helm/openldap/values-example.yaml +++ /dev/null @@ -1,23 +0,0 @@ -replicas: 1 - -nodeSelector: - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 - -env: - BITNAMI_DEBUG: 'true' - LDAP_ROOT: 'dc=example,dc=org' - LDAP_CONFIG_ADMIN_ENABLED: 'no' - LDAP_USER_DC: 'users' - LDAP_GROUP: 'readers' - LDAP_ADD_SCHEMAS: 'yes' - LDAP_EXTRA_SCHEMAS: 'cosine,inetorgperson,nis' - LDAP_SKIP_DEFAULT_TREE: 'no' - LDAP_CUSTOM_LDIF_DIR: '/ldifs' - LDAP_CUSTOM_SCHEMA_FILE: '/schema/custom.ldif' - LDAP_ULIMIT_NOFILES: '1024' - LDAP_ALLOW_ANON_BINDING: 'yes' - LDAP_LOGLEVEL: '256' - -persistence: - storageClassName: 'openldap-nfs' diff --git a/helm/openldap/values.yaml b/helm/openldap/values.yaml deleted file mode 100644 index aedfc626e..000000000 --- a/helm/openldap/values.yaml +++ /dev/null @@ -1,101 +0,0 @@ -replicas: 1 - -image: 'docker.io/bitnami/openldap:2.6.4' -imagePullPolicy: 'IfNotPresent' - -labels: {} -annotations: {} - -ldapPort: 1389 -ldapsPort: 1636 - -resources: - requests: - cpu: 125m - memory: 256Mi - limits: - memory: 256Mi - -nodeAffinity: {} - -updateStrategy: - type: RollingUpdate - -livenessProbe: - initialDelaySeconds: 60 - timeoutSeconds: 10 - periodSeconds: 10 - successThreshold: 1 - failureThreshold: 5 -readinessProbe: - timeoutSeconds: 10 - periodSeconds: 10 - successThreshold: 1 - failureThreshold: 5 - -# How long to wait to stop gracefully -terminationGracePeriod: 10 - -## Use an alternate scheduler. -## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/ -## -schedulerName: '' - -imagePullSecrets: {} - -nodeSelector: {} -tolerations: [] - -# persistence is mounted on /bitnami/openldap/ -persistence: - storageClassName: '' - accessModes: ['ReadWriteOnce'] - size: 5Gi - selectorLabels: {} - -initContainers: [] - -## By enabling tls, LDAP_TLS_* env variables will be automatically set. -## The format of the secret MUST be kubernetes.io/tls (ca.crt, tls.crt, tls.key). -## ref: https://kubernetes.io/docs/concepts/configuration/secret/#tls-secrets -tls: - enabled: false - secretName: '' - -## Defaults imported from https://hub.docker.com/r/bitnami/openldap/ -## Do not add sensitive informations here, prefer secret. 
-env: - LDAP_ROOT: 'dc=example,dc=org' - LDAP_CONFIG_ADMIN_ENABLED: 'no' - LDAP_USER_DC: 'users' - LDAP_GROUP: 'readers' - LDAP_ADD_SCHEMAS: 'yes' - LDAP_EXTRA_SCHEMAS: 'cosine,inetorgperson,nis' - LDAP_SKIP_DEFAULT_TREE: 'no' - LDAP_CUSTOM_LDIF_DIR: '/ldifs' - LDAP_CUSTOM_SCHEMA_FILE: '/schema/custom.ldif' - LDAP_ULIMIT_NOFILES: '1024' - LDAP_ALLOW_ANON_BINDING: 'yes' - LDAP_LOGLEVEL: '256' - -# Use envSecretName to import password and other sensitive env variables -envSecretName: '' - -# Extra volume mounts -volumeMounts: - [] - # - name: custom-ldifs - # mountPath: /ldifs - # readOnly: true - -# Extra volumes (use it to mount secrets like ldif) -volumes: - [] - # - name: custom-ldifs - # configMap: - # name: custom-ldifs - # defaultMode: 256 - -service: - enabled: true - type: ClusterIP diff --git a/helm/slurm-cluster/values-example.yaml b/helm/slurm-cluster/values-example.yaml deleted file mode 100644 index 7a382bf81..000000000 --- a/helm/slurm-cluster/values-example.yaml +++ /dev/null @@ -1,263 +0,0 @@ -sssd: - # secret containing sssd.conf - # Will be mounted in /secrets/sssd - secretName: sssd-secret - -munge: - # secret containing munge.key - # Will be mounted in /secrets/munge - secretName: munge-secret - -# secret containing jwt_hs256.key -# Will be mounted in /secrets/slurm -jwt: - secretName: slurm-secret - -slurmConfig: - clusterName: example - - compute: - srunPortRangeStart: 60001 - srunPortRangeEnd: 63000 - debug: debug5 - - controller: - parameters: enable_configless - debug: debug5 - - accounting: | - AccountingStorageType=accounting_storage/slurmdbd - AccountingStorageHost=slurm-cluster-example-db.slurm-cluster.svc.cluster.local - AccountingStoragePort=6819 - AccountingStorageTRES=gres/gpu - AccountingStoreFlags=job_comment,job_env,job_script - # AccountingStorageEnforce=associations,limits,qos - - defaultResourcesAllocation: | - DefCpuPerGPU=8 - DefMemPerCpu=600 - - scheduling: | - SchedulerParameters=salloc_wait_nodes,sbatch_wait_nodes,batch_sched_delay=15 - SchedulerType=sched/backfill - SelectType=select/cons_tres - SelectTypeParameters=CR_CPU_Memory - SchedulerTimeSlice=60 - UnkillableStepTimeout=300 - - priorities: | - # PriorityType=priority/multifactor - # PriorityFavorSmall=NO - # PriorityWeightAge=1000 - # PriorityWeightFairshare=10000 - # PriorityWeightTRES=CPU=1000,Mem=2000,GRES/gpu=8000 - # PriorityWeightJobSize=1000 - # PriorityWeightPartition=1000 - # PriorityWeightQOS=1000 - # PriorityDecayHalfLife=0 - # PriorityUsageResetPeriod=MONTHLY - - nodes: | - NodeName=cn1 CPUs=32 Boards=1 SocketsPerBoard=1 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=128473 Gres=gpu:4 - - partitions: | - PartitionName=main Nodes=cn1 Default=YES MaxTime=INFINITE State=UP OverSubscribe=EXCLUSIVE - - gres: | - NodeName=cn1 Name=gpu File=/dev/nvidia[0-3] - - # Extra slurm.conf configuration - extra: | - LaunchParameters=enable_nss_slurm - DebugFlags=Script,Gang,SelectType - TCPTimeout=5 - CommunicationParameters=NoAddrCache - - # MPI stacks running over Infiniband or OmniPath require the ability to allocate more - # locked memory than the default limit. Unfortunately, user processes on login nodes - # may have a small memory limit (check it by ulimit -a) which by default are propagated - # into Slurm jobs and hence cause fabric errors for MPI. 
- PropagateResourceLimitsExcept=MEMLOCK - - ProctrackType=proctrack/cgroup - TaskPlugin=task/cgroup - SwitchType=switch/none - MpiDefault=pmix_v2 - ReturnToService=2 #temp - GresTypes=gpu - PreemptType=preempt/qos - PreemptMode=REQUEUE - PreemptExemptTime=-1 - Prolog=/etc/slurm/prolog.d/* - Epilog=/etc/slurm/epilog.d/* - - # Federation - FederationParameters=fed_display - -controller: - replicas: 1 - - command: ['sh', '-c', 'update-ca-trust && /init'] - - persistence: - storageClassName: 'controller-state-nfs' - accessModes: ['ReadWriteOnce'] - size: 50Gi - selectorLabels: - app: slurm-controller - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 - - prologsConfigMap: slurmctl-example-prologs - epilogsConfigMap: slurmctl-example-epilogs - - nodeSelector: - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 - - resources: - requests: - cpu: '100m' - memory: '256Mi' - limits: - memory: '1Gi' - - dnsPolicy: 'None' - dnsConfig: - nameservers: - - 10.96.0.10 - searches: - - slurm-cluster.svc.cluster.local - - example.com - options: - - name: ndots - value: '0' - - # Extra volume mounts - volumeMounts: - - name: ca-cert - mountPath: /etc/pki/ca-trust/source/anchors - - # Extra volumes - volumes: - - name: ca-cert - secret: - secretName: local-ca-secret - - # Extra volume claims - volumeClaimTemplates: [] - - servicePerReplica: - port: 6817 - type: ClusterIP - -login: - enabled: true - replicas: 2 - - strategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - - command: ['sh', '-c', 'update-ca-trust && /init'] - - sshd: - secretName: login-sshd-secret - - nodeSelector: - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 - - dnsPolicy: 'None' - dnsConfig: - nameservers: - - 10.96.0.10 - searches: - - slurm-cluster.svc.cluster.local - - example.com - options: - - name: ndots - value: '0' - - resources: - requests: - cpu: '100m' - memory: '256Mi' - limits: - memory: '1Gi' - - # Extra volume mounts - volumeMounts: - - name: ca-cert - mountPath: /etc/pki/ca-trust/source/anchors - - name: ldap-users-pvc - mountPath: /home/ldap-users - - # Extra volumes - volumes: - - name: ca-cert - secret: - secretName: local-ca-secret - - name: ldap-users-pvc - persistentVolumeClaim: - claimName: ldap-users-example-pvc - net: - # Kubernetes host interface - masterInterface: priv0 - mode: l2 - type: ipvlan - - # https://www.cni.dev/plugins/current/ipam/static/ - ipam: - type: host-local - ranges: - - - subnet: 192.168.0.0/24 - rangeStart: 192.168.0.20 - rangeEnd: 192.168.0.21 - gateway: 192.168.0.1 - - rest: - enabled: true - command: ['sh', '-c', 'update-ca-trust && /init'] - - resources: - requests: - cpu: '100m' - memory: '128Mi' - limits: - memory: '256Mi' - - # Extra volume mounts - volumeMounts: - - name: ca-cert - mountPath: /etc/pki/ca-trust/source/anchors - -db: - enabled: true - - command: ['sh', '-c', 'update-ca-trust && /init'] - - config: - secretName: 'slurmdbd-example-conf-secret' - - resources: - requests: - cpu: '100m' - memory: '128Mi' - limits: - memory: '256Mi' - - volumeMounts: - - name: ca-cert - mountPath: /etc/pki/ca-trust/source/anchors - - volumes: - - name: ca-cert - secret: - secretName: local-ca-secret - - nodeSelector: - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 diff --git a/helm/squid/values-example.yaml b/helm/squid/values-example.yaml deleted file mode 100644 index dd0996ca0..000000000 --- a/helm/squid/values-example.yaml +++ 
/dev/null @@ -1,14 +0,0 @@ -config: | - acl local_nodes src YOUR_CLIENT_IPS - acl stratum_ones dst cvmfs.ch1.deepsquare.run - http_port 3128 - http_access allow stratum_ones - http_access allow local_nodes - http_access allow localhost - http_access deny all - collapsed_forwarding on - minimum_expiry_time 0 - maximum_object_size 1024 MB - cache_mem 128 MB - maximum_object_size_in_memory 128 KB - cache_dir ufs /var/spool/squid 50000 16 256 diff --git a/helm/xcat/.helmignore b/helm/xcat/.helmignore deleted file mode 100644 index 0e8a0eb36..000000000 --- a/helm/xcat/.helmignore +++ /dev/null @@ -1,23 +0,0 @@ -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. -.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*.orig -*~ -# Various IDEs -.project -.idea/ -*.tmproj -.vscode/ diff --git a/helm/xcat/Chart.yaml b/helm/xcat/Chart.yaml deleted file mode 100644 index a23977c36..000000000 --- a/helm/xcat/Chart.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v2 -name: xcat -description: xCAT Extreme Cloud Administration Toolkit, xCAT offers complete management for bare-metal based cluster. -type: application -home: https://github.com/squarefactory/xcat-rocky -icon: https://raw.githubusercontent.com/xcat2/xcat2.github.io/master/webcontent/assets/logo.jpeg -sources: - - https://github.com/squarefactory/xcat-rocky -version: 0.1.0 -appVersion: '0.1.3' -maintainers: - - name: Marc Nguyen - email: marc@squarefactory.io - - name: Christophe Lillo - email: lillo@squarefactory.io diff --git a/helm/xcat/LICENSE b/helm/xcat/LICENSE deleted file mode 100644 index 42a282337..000000000 --- a/helm/xcat/LICENSE +++ /dev/null @@ -1,207 +0,0 @@ - -Portions of this software are licensed as follows: - -* All content residing under the "web/" directory of this repository is licensed under "Creative Commons Attribution Share Alike 4.0 International" (CC-BY-SA-4.0). See docs/LICENCE for details. -* Content outside of the above mentioned directories or restrictions above is available under the "Apache License 2.0" as defined below. - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2022 SquareFactory SA - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file diff --git a/helm/xcat/README.md b/helm/xcat/README.md deleted file mode 100644 index 913fad502..000000000 --- a/helm/xcat/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# xcat - -![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.1.3](https://img.shields.io/badge/AppVersion-0.1.3-informational?style=flat-square) - -xCAT Extreme Cloud Administration Toolkit, xCAT offers complete management for bare-metal based cluster. 
- -**Homepage:** - -## Maintainers - -| Name | Email | Url | -| ---- | ------ | --- | -| Marc Nguyen | | | -| Christophe Lillo | | | - -## Source Code - -* - -## Values - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| image | string | `"ghcr.io/squarefactory/xcat-rocky:0.1.3-xcat2.16.3-rocky8.4"` | | -| imagePullPolicy | string | `"IfNotPresent"` | | -| imagePullSecrets | object | `{}` | | -| labels | object | `{}` | | -| net.dns.nameservers[0] | string | `"127.0.0.1"` | | -| net.ipam.addresses[0].address | string | `"192.168.0.3/24"` | | -| net.ipam.addresses[0].gateway | string | `"192.168.0.1"` | | -| net.ipam.routes[0].dst | string | `"0.0.0.0/0"` | | -| net.ipam.type | string | `"static"` | | -| net.masterInterface | string | `"eth0"` | | -| net.mode | string | `"l2"` | | -| net.type | string | `"ipvlan"` | | -| nodeAffinity | object | `{}` | | -| nodeSelector | object | `{}` | | -| persistence.accessModes[0] | string | `"ReadWriteOnce"` | | -| persistence.selectorLabels | object | `{}` | | -| persistence.size | string | `"50Gi"` | | -| persistence.storageClassName | string | `""` | | -| podAnnotations | object | `{}` | | -| podSecurityContext.runAsUser | int | `0` | | -| replicas | int | `1` | | -| resources.limits.cpu | string | `"2"` | | -| resources.limits.memory | string | `"8Gi"` | | -| resources.requests.cpu | string | `"2"` | | -| resources.requests.memory | string | `"8Gi"` | | -| schedulerName | string | `""` | | -| securityContext.capabilities.add[0] | string | `"CAP_SYS_ADMIN"` | | -| securityContext.capabilities.add[1] | string | `"NET_ADMIN"` | | -| securityContext.readOnlyRootFilesystem | bool | `false` | | -| securityContext.runAsNonRoot | bool | `false` | | -| securityContext.runAsUser | int | `0` | | -| terminationGracePeriod | int | `10` | | -| tmp.medium | string | `""` | | -| tmp.size | string | `"50Gi"` | | -| tolerations | list | `[]` | | -| updateStrategy | string | `"RollingUpdate"` | | - ----------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) diff --git a/helm/xcat/templates/_helpers.tpl b/helm/xcat/templates/_helpers.tpl deleted file mode 100644 index f14730832..000000000 --- a/helm/xcat/templates/_helpers.tpl +++ /dev/null @@ -1,16 +0,0 @@ -{{/* vim: set filetype=mustache: */}} -{{/* -Expand the name of the chart. -*/}} -{{- define "xcat.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} -{{- end -}} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -*/}} -{{- define "xcat.fullname" -}} -{{- $name := default .Chart.Name .Values.nameOverride -}} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} -{{- end -}} diff --git a/helm/xcat/templates/network-attachment-definition.yaml b/helm/xcat/templates/network-attachment-definition.yaml deleted file mode 100644 index 8f11ff841..000000000 --- a/helm/xcat/templates/network-attachment-definition.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: 'k8s.cni.cncf.io/v1' -kind: NetworkAttachmentDefinition -metadata: - name: {{ template "xcat.name" . 
}}-net - namespace: {{ .Release.Namespace }} - labels: - release: {{ .Release.Name | quote }} - chart: "{{ .Chart.Name }}" -spec: - config: | - { - "cniVersion": "0.4.0", - "type": "{{ .Values.net.type }}", - "master": "{{ .Values.net.masterInterface }}", - "mode": "{{ .Values.net.mode }}", - "ipam": {{ .Values.net.ipam | toJson }} - } diff --git a/helm/xcat/templates/statefulset.yaml b/helm/xcat/templates/statefulset.yaml deleted file mode 100644 index b8eca6455..000000000 --- a/helm/xcat/templates/statefulset.yaml +++ /dev/null @@ -1,190 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: {{ template "xcat.name" . }} - namespace: {{ .Release.Namespace }} - labels: - release: {{ .Release.Name | quote }} - chart: "{{ .Chart.Name }}" -spec: - serviceName: {{ template "xcat.name" . }} - selector: - matchLabels: - app: "{{ template "xcat.name" . }}" - replicas: {{ .Values.replicas }} - podManagementPolicy: {{ .Values.podManagementPolicy }} - updateStrategy: - type: {{ .Values.updateStrategy }} - template: - metadata: - name: "{{ template "xcat.name" . }}" - labels: - release: {{ .Release.Name | quote }} - chart: "{{ .Chart.Name }}" - app: "{{ template "xcat.name" . }}" - {{- range $key, $value := .Values.labels }} - {{ $key }}: {{ $value | quote }} - {{- end }} - annotations: - v1.multus-cni.io/default-network: "{{ .Release.Namespace }}/{{ template "xcat.name" . }}-net" - {{- range $key, $value := .Values.podAnnotations }} - {{ $key }}: {{ $value | quote }} - {{- end }} - spec: - {{- if .Values.schedulerName }} - schedulerName: "{{ .Values.schedulerName }}" - {{- end }} - securityContext: -{{ toYaml .Values.podSecurityContext | indent 8 }} - {{- with .Values.net.dns }} - dnsPolicy: "None" - dnsConfig: -{{ toYaml . | indent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: -{{ toYaml . | indent 8 }} - {{- end }} - {{- with .Values.nodeSelector }} - nodeSelector: -{{ toYaml . | indent 8 }} - {{- end }} - {{- with .Values.hostAliases }} - hostAliases: -{{ toYaml . | indent 8 }} - {{- end }} - {{- with .Values.nodeAffinity }} - affinity: -{{ toYaml . | indent 8 }} - {{- end }} - terminationGracePeriodSeconds: {{ .Values.terminationGracePeriod }} - {{- if .Values.imagePullSecrets }} - imagePullSecrets: -{{ toYaml .Values.imagePullSecrets | indent 8 }} - {{- end }} - containers: - - name: "{{ template "xcat.name" . 
}}" - securityContext: -{{ toYaml .Values.securityContext | indent 12 }} - image: "{{ .Values.image }}" - imagePullPolicy: "{{ .Values.imagePullPolicy }}" - ports: - - name: xcatdport-tcp - containerPort: 3001 - protocol: TCP - - name: xcatdport-udp - containerPort: 3001 - protocol: UDP - - name: xcatiport-tcp - containerPort: 3002 - protocol: TCP - - name: xcatiport-udp - containerPort: 3002 - protocol: UDP - - name: echo-udp - containerPort: 7 - protocol: UDP - - name: rsync-tcp - containerPort: 873 - protocol: TCP - - name: rsync-udp - containerPort: 873 - protocol: UDP - - name: domain-tcp - containerPort: 53 - protocol: TCP - - name: domain-udp - containerPort: 53 - protocol: UDP - - name: bootps - containerPort: 67 - protocol: UDP - - name: dhcp - containerPort: 67 - protocol: TCP - - name: dhcpc - containerPort: 68 - protocol: TCP - - name: bootpc - containerPort: 68 - protocol: UDP - - name: tftp-tcp - containerPort: 69 - protocol: TCP - - name: tftp-udp - containerPort: 69 - protocol: UDP - - name: www-tcp - containerPort: 80 - protocol: TCP - - name: www-udp - containerPort: 80 - protocol: UDP - - name: sunrpc-udp - containerPort: 111 - protocol: UDP - - name: rsyslogd-tcp - containerPort: 514 - protocol: TCP - - name: rsyslogd-udp - containerPort: 514 - protocol: UDP - - name: pxe - containerPort: 4011 - protocol: TCP - - name: ipmi-tcp - containerPort: 623 - protocol: TCP - - name: ipmi-udp - containerPort: 623 - protocol: UDP - - name: ssh-tcp - containerPort: 2200 - protocol: TCP - - name: ssh-udp - containerPort: 2200 - protocol: UDP - volumeMounts: - - name: xcatdata - mountPath: /xcatdata - - name: cgroup - mountPath: /sys/fs/cgroup - readOnly: true - - name: varlogxcat - mountPath: /var/log/xcat - - mountPath: /tmp - name: tmp - subPath: tmp - - mountPath: /run - name: tmp - subPath: run - - mountPath: /run/lock - name: tmp - subPath: run-lock - resources: -{{ toYaml .Values.resources | indent 12 }} - volumes: - - name: varlogxcat - emptyDir: {} - - name: cgroup - hostPath: - path: /sys/fs/cgroup - type: Directory - - name: tmp - emptyDir: - medium: {{ .Values.tmp.medium }} - sizeLimit: {{ .Values.tmp.size }} - volumeClaimTemplates: - - metadata: - name: xcatdata - spec: - accessModes: {{ .Values.persistence.accessModes }} - storageClassName: {{ .Values.persistence.storageClassName }} - resources: - requests: - storage: {{ .Values.persistence.size }} - {{- with .Values.persistence.selectorLabels }} - selector: - matchLabels: -{{ toYaml . 
| indent 12 }} - {{- end }} diff --git a/helm/xcat/values-example.yaml b/helm/xcat/values-example.yaml deleted file mode 100644 index bcbe018cc..000000000 --- a/helm/xcat/values-example.yaml +++ /dev/null @@ -1,37 +0,0 @@ -nodeSelector: - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 - -resources: - requests: - cpu: '250m' - memory: '8Gi' - limits: - cpu: '8' - memory: '8Gi' - -persistence: - storageClassName: 'xcat-nfs' - accessModes: ['ReadWriteOnce'] - size: 50Gi - -net: - # Kubernetes host interface - masterInterface: eno2 - mode: l2 - type: ipvlan - - # https://www.cni.dev/plugins/current/ipam/static/ - ipam: - type: static - addresses: - - address: 192.168.0.3/24 - gateway: 192.168.0.1 - routes: - - dst: 0.0.0.0/0 - - dns: - nameservers: - - 127.0.0.1 - searches: - - example.com diff --git a/helm/xcat/values.yaml b/helm/xcat/values.yaml deleted file mode 100644 index f2cf88b94..000000000 --- a/helm/xcat/values.yaml +++ /dev/null @@ -1,82 +0,0 @@ ---- -replicas: 1 - -image: 'ghcr.io/squarefactory/xcat-rocky:0.2.3-xcat2.16.3-rocky8.4' -imagePullPolicy: 'IfNotPresent' - -podAnnotations: {} - -# additionals labels -labels: {} - -resources: - requests: - cpu: '2' - memory: '8Gi' - limits: - cpu: '2' - memory: '8Gi' - -# This is the node affinity settings as defined in -# https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#node-affinity-beta-feature -nodeAffinity: {} - -updateStrategy: RollingUpdate - -podSecurityContext: - runAsUser: 0 - -securityContext: - capabilities: - add: - - CAP_SYS_ADMIN - - NET_ADMIN - readOnlyRootFilesystem: false - runAsNonRoot: false - runAsUser: 0 - -# How long to wait for xcat to stop gracefully -terminationGracePeriod: 10 - -## Use an alternate scheduler. -## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/ -## -schedulerName: '' - -imagePullSecrets: {} - -# Try to use nodeSelector to filter the hosts without the master interface -nodeSelector: {} -tolerations: [] - -tmp: - medium: '' - size: 50Gi - -persistence: - storageClassName: '' - accessModes: ['ReadWriteOnce'] - size: 50Gi - selectorLabels: - {} - # app: xcat - -net: - # Kubernetes host interface - masterInterface: eth0 - mode: l2 - type: ipvlan - - # https://www.cni.dev/plugins/current/ipam/static/ - ipam: - type: static - addresses: - - address: 192.168.0.3/24 - gateway: 192.168.0.1 - routes: - - dst: 0.0.0.0/0 - - # https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#pod-dns-config - dns: - nameservers: - - 127.0.0.1 diff --git a/packer-recipes/README.md b/packer-recipes/README.md deleted file mode 100644 index 051dd2e51..000000000 --- a/packer-recipes/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# Packer recipes repositories for xCAT and other... 
- -## TL;DR with Rocky - -```sh -# Post provisionners need sudo -sudo packer build rocky.json -``` - -## Deploy to xCAT - -**Via NFS** : - -```bash -# xCAT nfs mount -export MOUNT=nfs.example.com:/mnt/pool1/k8s/xcat -# xCAT root img -export EXPORT_PATH=/install/netboot/rocky8.6/x86_64/compute/rootimg/ - -source ./scripts-local/setup-nbd -source ./scripts-local/rsync-to-nfs -source ./scripts-local/teardown-nbd -``` - -**Via rsync(sftp/scp)** : - -```bash -# xCAT ssh address -export XCAT_SERVER=root@nfs.example.com -# xCAT root img -export EXPORT_PATH=/install/netboot/rocky8.6/x86_64/compute/rootimg/ - -source ./scripts-local/setup-nbd -source ./scripts-local/rsync-to-xcat -source ./scripts-local/teardown-nbd -``` - -## Notes - -`compute.bare.json` creates bare metal image with all the stuff required to run on our infrastructure (NVIDIA, Infiniband, ...). - -`compute.cloud.json` creates 2 cloud images, one with NVIDIA, and one without. diff --git a/packer-recipes/rocky8.6/http/ks.bare.cfg b/packer-recipes/rocky8.6/http/ks.bare.cfg index 5eebad91b..873906ad1 100644 --- a/packer-recipes/rocky8.6/http/ks.bare.cfg +++ b/packer-recipes/rocky8.6/http/ks.bare.cfg @@ -50,7 +50,6 @@ part / --size=1 --grow --asprimary --fstype=xfs # Postinstall %post --erroronfail set -ex -mkdir /opt/xcat # Add repos permanently dnf config-manager --add-repo https://yum.deepsquare.run/yum.repo @@ -60,7 +59,7 @@ dnf config-manager --add-repo https://www.beegfs.io/release/beegfs_7.3.1/dists/b dnf -y --nogpgcheck install https://download1.rpmfusion.org/nonfree/el/rpmfusion-nonfree-release-8.noarch.rpm dnf -y install unrar -# Install xCat provisioning service +# Install Grendel provisioning service cat << 'END' >/pull-postscript.sh #!/bin/sh set -ex @@ -116,7 +115,7 @@ cat <<'END' >/usr/lib/systemd/system/beegfs-helperd.service Description=BeeGFS Helperd Documentation=http://www.beegfs.com/content/documentation/ Requires=network-online.target -After=network-online.target xcatpostinit1.service +After=network-online.target grendel-postscript.service [Service] ExecStart=/opt/beegfs/sbin/beegfs-helperd cfgFile=/etc/beegfs/beegfs-helperd.conf runDaemonized=false @@ -133,7 +132,7 @@ Requires=network-online.target After=network-online.target local-fs.target time-sync.target beegfs-helperd.service \ beegfs-mgmtd.service beegfs-meta.service beegfs-storage.service openib.service openibd.service \ rdma.service opensmd.service opensm.service nss-lookup.target nss-user-lookup.target \ -slapd.service autofs.service ypbind.service nscd.service nslcd.service sshd.service xcatpostinit1.service +slapd.service autofs.service ypbind.service nscd.service nslcd.service sshd.service grendel-postscript.service [Service] Type=oneshot diff --git a/packer-recipes/rocky8.6/scripts-local/rsync-to-nfs b/packer-recipes/rocky8.6/scripts-local/rsync-to-nfs deleted file mode 100755 index 587624716..000000000 --- a/packer-recipes/rocky8.6/scripts-local/rsync-to-nfs +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -eux - -set -eux - -if [ $UID -ne 0 ]; then - echo "ERROR: Must be run as root!" >&2 - exit 1 -fi - -clean() { - echo 'Unmounting image...' 
- umount -f "$NFS_DIR" - rmdir --ignore-fail-on-non-empty "$NFS_DIR" - umount -f "$TMP_DIR" - rmdir --ignore-fail-on-non-empty "$TMP_DIR" -} - -trap clean EXIT - -export MOUNT=${MOUNT:-"10.10.2.11:/mnt/pool1/k8s/xcat"} -export EXPORT_PATH=${EXPORT_PATH:-/install/netboot/rocky8.6/x86_64/compute/rootimg/} -NFS_DIR=$(mktemp -d /tmp/nfs-XXXX) -TMP_DIR=$(mktemp -d /tmp/packer-XXXX) - -echo 'Mounting nfs' -mount -t nfs "$MOUNT" "$NFS_DIR" - -echo 'Mounting root partition...' -mount "${NBD}p1" "$TMP_DIR" - -echo "Content inside ${NFS_DIR}/${EXPORT_PATH}:" -ls -lah "${NFS_DIR}/${EXPORT_PATH}" - -echo "Preview: rsync -avzP --delete $TMP_DIR/ ${NFS_DIR}/${EXPORT_PATH}" -read -rp "Correct (this may delete everything in that directory and subdirectories) (y/N)?" choice -case "$choice" in -y | Y) ;; -*) umount -f "$NFS_DIR" && exit 0 ;; -esac - -rsync -avzP --delete "$TMP_DIR/" "${NFS_DIR}/${EXPORT_PATH}" diff --git a/packer-recipes/rocky8.6/scripts-local/rsync-to-xcat b/packer-recipes/rocky8.6/scripts-local/rsync-to-xcat deleted file mode 100755 index 746eeebda..000000000 --- a/packer-recipes/rocky8.6/scripts-local/rsync-to-xcat +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -ex - -set -eux - -if [ $UID -ne 0 ]; then - echo "ERROR: Must be run as root!" >&2 - exit 1 -fi - -clean() { - echo 'Unmounting image...' - umount -f "$TMP_DIR" - rmdir --ignore-fail-on-non-empty "$TMP_DIR" -} - -trap clean EXIT - -export XCAT_SERVER=${XCAT_SERVER:-root@10.10.2.160} -export EXPORT_PATH=${EXPORT_PATH:-/xcatdata/install/netboot/rocky8.6/x86_64/compute/rootimg/} - -TMP_DIR=$(mktemp -d /tmp/packer-XXXX) - -echo 'Mounting root partition...' -mount "${NBD}p1" "$TMP_DIR" - -echo "Content inside ${XCAT_SERVER}:${EXPORT_PATH}:" -ssh -p 2200 "${XCAT_SERVER}" ls -lah "${EXPORT_PATH}" - -echo "Preview: rsync -avzP --delete $TMP_DIR/ ${XCAT_SERVER}:${EXPORT_PATH}" -read -rp "Is the path ${XCAT_SERVER}:${EXPORT_PATH}, correct (this may delete everything in that directory and subdirectories) (y/N)?" choice -case "$choice" in -y | Y) ;; -*) exit 1 ;; -esac - -rsync -e 'ssh -p 2200' -avzP --delete "$TMP_DIR/" "${XCAT_SERVER}:${EXPORT_PATH}" diff --git a/web/docs/getting-started/00-overview.assets/getting-started.drawio.svg b/web/docs/getting-started/00-overview.assets/getting-started.drawio.svg new file mode 100644 index 000000000..ebf45dae8 --- /dev/null +++ b/web/docs/getting-started/00-overview.assets/getting-started.drawio.svg @@ -0,0 +1,4 @@ + + + +
[getting-started.drawio.svg: architecture diagram. Text labels: Kubernetes Cluster — Network Stack (Domain Name System: CoreDNS; Bare-metal LB for k8s: MetalLB; Multi Network Interfaces: Multus; L7 LB and Reverse Proxy: Traefik), Provisioning Stack (Provisioning System: Grendel), GitOps Stack (Continuous Deployment: ArgoCD; TLS certificates generation: cert-manager; Secrets management: sealed-secrets); Compute Nodes; DNS Forward to external networks; HTTP/TCP/UDP traffic; Announce Traefik IP; Expose Grendel to local network over DHCP/PXE/TFTP/HTTP.]
\ No newline at end of file diff --git a/web/docs/getting-started/00-overview.md b/web/docs/getting-started/00-overview.md new file mode 100644 index 000000000..c248c193b --- /dev/null +++ b/web/docs/getting-started/00-overview.md @@ -0,0 +1,23 @@ +# Overview + +The purpose of this guide is to achieve the following objectives: + +- Install the Kubernetes Cluster along with its essential components such as: + - CoreDNS, which provides domain name resolution services + - MetalLB, which exposes Kubernetes Services to the local network + - Traefik, which offers HTTP/TCP/UDP L7 routing and load balancing functionality + - Multus, which uses IPVLAN to expose Grendel + - cert-manager and sealed-secrets, which provide certificate and secrets management capabilities +- Deploy ArgoCD for implementing GitOps practices. +- Utilize ArgoCD to deploy Grendel, which is a bare-metal provisioning system. +- Provision bare-metal nodes using Grendel. + +## Architecture + +![getting-started.drawio](./00-overview.assets/getting-started.drawio.svg#invert-on-dark) + +The Kubernetes Cluster functions as the control plane for managing compute nodes. **The compute nodes are therefore not considered part of the Kubernetes Cluster and do not run any Kubernetes workloads.** + +ClusterFactory is designed to prioritize bare-metal performance and thus leverages Slurm as the bare-metal batch job scheduler. + +If you intend to run Kubernetes workloads on compute nodes, you may install k0s using k0sctl. However, this approach may not be compatible with the ClusterFactory strategy. diff --git a/web/docs/getting-started/01-requirements-recommendations.md b/web/docs/getting-started/01-requirements-recommendations.md index 5653e22ba..05f2c8136 100644 --- a/web/docs/getting-started/01-requirements-recommendations.md +++ b/web/docs/getting-started/01-requirements-recommendations.md @@ -40,6 +40,9 @@ We recommend: ## Recommended documentation - [Kubernetes documentation](https://kubernetes.io/docs/concepts/) -- [Kubernetes API reference](https://kubernetes.io/docs/reference/kubernetes-api/) - [cfctl.yaml API reference](/docs/reference/cfctl.yaml) - [Argo CD declarative setup](https://argo-cd.readthedocs.io/en/stable/operator-manual/declarative-setup/) + +**Before using ClusterFactory, it is strongly advised to have a comprehensive understanding of how Kubernetes operates, specifically with regard to storage and network management using features such as PersistentVolume, StorageClass, Service, Ingress, LoadBalancer, and more.** + +To try a "mini" version of Kubernetes, we recommend k0s or minikube. diff --git a/web/docs/getting-started/02-setting-up-repository.md b/web/docs/getting-started/02-setting-up-repository.md index 9a1977a1c..3e12b7d8b 100644 --- a/web/docs/getting-started/02-setting-up-repository.md +++ b/web/docs/getting-started/02-setting-up-repository.md @@ -75,16 +75,22 @@ Git is capable of managing multiple remote repositories. By default, `origin` is # upstream DISABLE (push) ``` -## 3. Checkout to a stable version and create a new branch +## 3. (Optional) Checkout to a stable version and create a new branch You can checkout to a stable version: ```shell title="user@local:/ClusterFactory" -git checkout -b configs v0.7.0 -# You can delete the local main branch -git branch -D main +git checkout -b configs ``` +:::info + +Please note that ClusterFactory is under active development; it is strongly recommended to follow the latest version of ClusterFactory. + +Unannounced breaking changes are to be expected. + +::: + ## 4.
Rename the examples and commit Copy `argo.example`, `core.example`, `cfctl.yaml.example`, and remove the `.example`: @@ -163,9 +169,8 @@ Now that you have a fork, you can push your own changes into your repository. Fo │ ├── cvmfs-server/ │ ├── cvmfs-service/ │ ├── ipmi-exporter/ -│ ├── openldap/ │ ├── slurm-cluster/ -│ └── xcat/ +│ └── grendel/ ├── manifests/ <----- │ └── my-application/ <----- │ └── statefulset.yaml <----- diff --git a/web/docs/getting-started/03-k0s-configuration.md b/web/docs/getting-started/03-k0s-configuration.md index 00349233d..cb3dd1c1a 100644 --- a/web/docs/getting-started/03-k0s-configuration.md +++ b/web/docs/getting-started/03-k0s-configuration.md @@ -2,6 +2,24 @@ ## Specifying the hosts +Specify the nodes that will be included in the Kubernetes Cluster, which will function as the control plane for managing compute nodes. + +A node designated as a **controller** will run the following components: + +- An **etcd server**, which serves as the Kubernetes database. + +- An **API server**, which serves as the entry point for `kubectl` commands. + +- A **Pod scheduler**. + +- A **controller-manager**, which acts as the central decision-making component of Kubernetes. + +- A **Konnectivity-server**, responsible for facilitating communication between Kubernetes controller and worker nodes. + +- A **k0s API**, which serves as the entry point for `k0s` commands. + +A node designated as a **worker** will solely run the following components: + +- A **kubelet**, the agent that communicates with the control plane (through the Konnectivity tunnel). +- **containerd** containers, i.e. the Pods running on the worker node. + +It is crucial to always have an odd number of controllers (1, 3, 5, ...) so that etcd keeps its quorum and the cluster cannot end up in a deadlock. + Edit the `cfctl.yaml` file. Start with the `hosts` field : ```yaml title=cfctl.yaml @@ -48,7 +66,7 @@ k0s: dynamicConfig: false config: apiVersion: k0s.k0sproject.io/v1beta1 - kind: Cluster + kind: ClusterConfig metadata: name: k8s.example.com spec: @@ -90,20 +108,22 @@ k0s: enabled: false ``` -Check the CIDR and make sure it doesn't conflict with any IP range of your network. - -Again, **you should read the specification carefully as the modification of one the k0s field won't be allowed in the future**. +Most of the default values are sane, but you should check that the CIDRs do not conflict with any IP range of your network. It is also recommended to manually tune the MTU to match your switch and router values. If you wish to use a HA setup, please follow [this guide](/docs/guides/maintenance/high-availability). ## Initial Deployment +:::tip + If you forgot to install the utilities, just run: ```shell title="user@local:/ClusterFactory" . ./scripts/setup-env ``` +::: + Deploy the cluster with: ```shell title="user@local:/ClusterFactory" diff --git a/web/docs/getting-started/04-core-apps-deployment.md b/web/docs/getting-started/04-core-apps-deployment.md index e87c9716b..44e630549 100644 --- a/web/docs/getting-started/04-core-apps-deployment.md +++ b/web/docs/getting-started/04-core-apps-deployment.md @@ -6,14 +6,15 @@ We will deploy: - MetalLB advertisements, for Load Balancing - CoreDNS, the internal DNS for Kubernetes - Sealed Secrets, secret management optimized for GitOps -- Cert-manager issuers, to generate your SSL certificates and enable, for free, TLS configuration. +- Cert-manager issuers, which generate your SSL certificates and enable free TLS configuration. - Argo CD, to enable GitOps.
- Multus CNI, to support multiple network interfaces -- KubeVirt, to deploy VM workloads ## Configuring MetalLB -MetalLB is a L2/L3 load balancer designed for bare metal Kubernetes clusters. It exposes the kubernetes `Services` to the external network. It uses either L2 (ARP) or BGP to advertise routes. It is possible to make "zoned" advertisements with L2, but we heavily recommend to use BGP for multi-zone clusters. +We need to configure MetalLB to expose Kubernetes Services like Traefik to the external network. + +MetalLB is an L2/L3 load balancer designed for bare metal Kubernetes clusters. It exposes the Kubernetes `Services` to the external network. It uses either L2 (ARP) or BGP to advertise routes. It is possible to make "zoned" advertisements with L2, but we heavily recommend using BGP for multi-zone clusters.
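If you do opt for BGP rather than L2, the configuration follows the same pattern as the L2 example shown in the hunk below. A minimal sketch, assuming the MetalLB ≥ 0.13 CRD API; the namespace, ASNs, peer address and address pool are placeholders, not manifests shipped with ClusterFactory:

```yaml
# Sketch only: BGP-mode MetalLB, assuming the metallb.io CRD API (MetalLB >= 0.13).
# Namespace, ASNs, peer address and pool are placeholders.
apiVersion: metallb.io/v1beta2
kind: BGPPeer
metadata:
  name: router-peer
  namespace: metallb-system
spec:
  myASN: 64512            # ASN used by the cluster (placeholder)
  peerASN: 64513          # ASN of the upstream router (placeholder)
  peerAddress: 192.168.1.1
---
apiVersion: metallb.io/v1beta1
kind: IPAddressPool
metadata:
  name: main-pool
  namespace: metallb-system
spec:
  addresses:
    - 192.168.1.100/32
---
apiVersion: metallb.io/v1beta1
kind: BGPAdvertisement
metadata:
  name: main-advertisement
  namespace: metallb-system
spec:
  ipAddressPools:
    - main-pool
```

With BGP, every MetalLB speaker peers with your router and announces the Service IP as a /32 route, so traffic is not tied to a single L2 zone.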
@@ -82,7 +83,7 @@ spec: - 192.168.0.100/32 ``` -**Note that the address is part of the local network.** +By using ARP, every machine in the subnet will be able to see that machine. For example, we are announcing 192.168.0.100. This IP is part of 192.168.0.0/24 and therefore, all the machines will be able to see 192.168.0.100. The indicated IP address will be allocated to the `LoadBalancer` Kubernetes Service, which here is Traefik. @@ -103,7 +104,9 @@ That's all! The MetalLB speakers on all the nodes will advertise the IP address ## Configuring Traefik -You should configure Traefik, which is the main Ingress and L7 load balancer. `core/traefik/values.yaml` is the main configuration file. +Traefik is the main L7 load balancer and router. It is mostly used to route HTTP packets based on rules (URL path, headers, ...). + +To configure Traefik, edit the `core/traefik/values.yaml` file, which is the main configuration file. You should look for `loadBalancerIP` and the `metallb.universe.tf` annotations: @@ -176,7 +179,7 @@ ingressRoute: This means that the Traefik dashboard is accessible to `traefik.internel` on the `traefik` entry point, which is the 9000/tcp port. In short: [http://traefik.internal:9000/dashboard/](http://traefik.internal:9000/dashboard/) (the trailing slash is important). -Your DNS should be configured to redirect `traefik.internal` to the load balancer at `192.168.1.100` (or `192.168.0.100` if using L2). Fortunately, we can configure and expose our own DNS. +**Your DNS should be configured to redirect `traefik.internal` to the load balancer at `192.168.1.100` (or `192.168.0.100` if using L2). Fortunately, we configure and expose our own CoreDNS.** For the rest of the guide, we will assume that you have announced `192.168.1.100/32` to the router. @@ -188,66 +191,7 @@ CoreDNS will be exposed to the external network thanks to the `IngressRoute` obj :::caution -Since `hostPort` will be used, make sure the host does not have port 53/udp busy. On most systems with SystemD, this port is occupied by a stub listener. Open the `/etc/systemd/resolved.conf` configuration file on the host and disable the stub listener by setting `DNSStubListener` to `no`. Finally, restart the service with `systemctl restart systemd-resolved.service`. - -If this is an unwanted feature (because you are using an other DNS for example), feel free to remove the routes and close the ports in the Traefik configuration. - -```shell title="user@local:/ClusterFactory" -rm core/coredns/overlays/prod/ingress-route.yaml -``` - -```diff title="core/traefik/values.yaml" -ports: - traefik: - port: 9000 - expose: false - exposedPort: 9000 - protocol: TCP -- dns-tcp: -- port: 8053 -- expose: true -- exposedPort: 53 -- protocol: TCP -- dns-udp: -- port: 8054 -- expose: true -- exposedPort: 53 -- protocol: UDP -``` - -```diff title="core/coredns/overlays/prod/daemonset.yaml" -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: coredns -spec: - template: - spec: - containers: - - name: coredns - ports: - - name: dns - containerPort: 53 -- hostPort: 53 - protocol: UDP - - name: dns-tcp - containerPort: 53 -- hostPort: 53 - protocol: TCP - volumes: - - name: config-volume - configMap: - name: coredns - items: - - key: Corefile - path: Corefile - - key: example.com.db - path: example.com.db - - key: internal.db - path: internal.db - defaultMode: 420 - -``` +Since `hostPort` will be used, make sure the host does not have port 53/udp busy. On most systems with SystemD, this port is occupied by a stub listener.
Open the `/etc/systemd/resolved.conf` configuration file on the Kubernetes hosts and disable the stub listener by setting `DNSStubListener` to `no`. Finally, restart the service with `systemctl restart systemd-resolved.service`. ```shell title="user@local:/ClusterFactory" ./scripts/deploy-core @@ -310,7 +254,7 @@ data: # Examples of external services 192.168.0.1 gateway.example.com 192.168.0.2 mn1.example.com - 192.168.0.3 xcatmn.example.com + 192.168.0.3 grendel.example.com 192.168.0.5 cvmfs.example.com 192.168.0.6 nfs.example.com 192.168.0.7 mysql.example.com @@ -326,23 +270,32 @@ data: 192.168.1.100 grafana.example.com ``` -Change the zones with your own and eventually change the `forward` field with your preferred DNS. -Change, add or remove service names as you wish. The `example.com.db` is only an example. +There are three DNS zones in this configuration: -Kubernetes Services going through the Load Balancer should use the MetalLB IP. +- The general zone `.:53`, which forwards DNS requests to `8.8.8.8` and announces the Kubernetes Services and Pod domain names. +- The internal zone `internal:53`, which contains rules to access the ArgoCD and Traefik dashboard. +- The internal zone `example.com:53`, which contains examples of rules to access other services. -Compute nodes should be declared here. The IP should be the one declared on xCAT. +**Modify the zones with your own custom ones and update the `forward` field with your preferred DNS.** Additionally, you can add, remove or modify domain names as per your requirements. Please note the following: -The slurm controller node should take the IP of the kubernetes node on which the pod is hosted, as it uses `hostPort`. +- For Kubernetes Services that are routed through the Traefik Load Balancer, you must use the MetalLB IP. +- If you are using `hostPort` on your pod (such as the Slurm Controller), set the IP to that of the Kubernetes host that is hosting the pod. +- If you are using IPVLAN, set the IP to be the IP that you declared in the IPVLAN settings. -You should configure the DNS of the machines to use CoreDNS. +You should configure the DNS of your machines to use CoreDNS. ```conf title="resolv.conf" nameserver 192.168.1.100 search example.com ``` -:::note +:::warning + +Be aware of the chicken-and-egg problem: you do NOT want the Kubernetes hosts themselves to use this DNS, since CoreDNS runs on the very cluster those hosts provide. + +::: + +:::warning If some files were added and removed, you must change the `daemonset.yaml`: @@ -371,33 +324,16 @@ Specify new certificate issuers in the `core/cert-manager` directory. It is highly recommended to add your own private certificate authority, follow the [official guide of cert-manager](https://cert-manager.io/docs/configuration/ca/). -You must create a Secret `ca-key-pair`: - -```yaml title="ca-key-pair-secret.yaml" -apiVersion: v1 -kind: Secret -metadata: - name: ca-key-pair - namespace: cert-manager -type: kubernetes.io/tls -stringData: - tls.crt: | - -----BEGIN CERTIFICATE----- - -----END CERTIFICATE----- - - tls.key: | - -----BEGIN RSA PRIVATE KEY----- - -----END RSA PRIVATE KEY----- -``` - -To generate a TLS certificate and its private key: +You must create a Secret `ca-key-pair`. To generate a TLS certificate and its private key: ```shell openssl genrsa -out tls.key 2048 openssl req -x509 -sha256 -new -nodes -key tls.key -days 3650 -out tls.crt +kubectl create secret tls ca-key-pair -n cert-manager --cert=tls.crt --key=tls.key +rm ca-key-pair-secret.yaml ``` -Seal it with `cfctl kubeseal` and apply it.
+Then you can create a private ClusterIssuer: ```yaml title="private-cluster-issuer.yaml" apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: name: private-cluster-issuer namespace: cert-manager spec: ca: secretName: ca-key-pair ``` -If you wish to use ACME HTTP-01, follow [this guide](https://cert-manager.io/docs/configuration/acme/http01/). This will create an Ingress by using the `ingress` field. +Edit the production ClusterIssuer to use your email address: -```yaml title="public-cluster-issuer.yaml" +```yaml title="production-cluster-issuer.yaml" apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: - name: public-cluster-issuer + name: production-cluster-issuer namespace: cert-manager spec: acme: email: john.smith@example.com server: https://acme-staging-v02.api.letsencrypt.org/directory privateKeySecretRef: - name: public-cluster-issuer-account-key + name: production-cluster-issuer-account-key solvers: - http01: ingress: class: traefik ``` +The production ClusterIssuer will contact the ACME servers to generate public TLS certificates signed by a trusted certificate authority. Note that the example above points to the Let's Encrypt staging server; switch the `server` field to the production ACME endpoint once your configuration is validated. + ## Configure the route and certificate for the ArgoCD dashboard -ArgoCD has a dashboard. To change the address and certificate, modify the `ingress-route.yaml` file and `certificate.yaml` in the `core/argo-cd` directory. +ArgoCD has a dashboard. To change the URL and certificate, modify the `ingress-route.yaml` file and `certificate.yaml` in the `core/argo-cd` directory. -**Make sure the addresses correspond to the ones defined in the CoreDNS (or in your private DNS).** +**Make sure the domain names correspond to the ones defined in the CoreDNS (or in your private DNS).** ```yaml title="Example of ingress-route.yaml for ArgoCD" apiVersion: traefik.containo.us/v1alpha1 @@ -468,40 +406,7 @@ spec: IngressRoute allows us to create more complex routing rules than the classic Ingress. However, Ingress can automatically generate a TLS certificate by using annotations, without the need to create a Certificate resource. -Example: - -```yaml title="ingress.yaml" -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: grafana-ingress - labels: - app.kubernetes.io/name: grafana-ingress - app.kubernetes.io/component: ingress - annotations: - cert-manager.io/cluster-issuer: selfsigned-cluster-issuer - traefik.ingress.kubernetes.io/router.entrypoints: websecure - traefik.ingress.kubernetes.io/router.tls: 'true' -spec: - ingressClassName: traefik - rules: - - host: grafana.example.com - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: grafana - port: - number: 80 - tls: - - hosts: - - grafana.example.com - secretName: grafana.example.com-secret -``` - -Our recommendation is to use Ingress for simple routes with HTTP. Otherwise, IngressRoute is the best solution for all the cases. +Our recommendation is to use Ingress for simple routes with HTTP. Otherwise, IngressRoute is the best solution for all cases. ## Deploying the core apps diff --git a/web/docs/getting-started/05-adding-repository-argocd.md b/web/docs/getting-started/05-adding-repository-argocd.md new file mode 100644 index 000000000..768b30ed6 --- /dev/null +++ b/web/docs/getting-started/05-adding-repository-argocd.md @@ -0,0 +1,29 @@ +# 4. Adding the Git repository to ArgoCD + +Argo CD can retrieve your repository from your Git hosting server, synchronize changes and deploy your Kubernetes manifests. + +1.
Create a local secret containing [an SSH deploy key](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/managing-deploy-keys#set-up-deploy-keys) and the git URL: + +```yaml title="argo/default/secrets/my-repository-secret.yaml.local" +apiVersion: v1 +kind: Secret +metadata: + name: my-repository-secret + namespace: argocd + labels: + argocd.argoproj.io/secret-type: repository +type: Opaque +stringData: + sshPrivateKey: | + -----BEGIN RSA PRIVATE KEY----- + -----END RSA PRIVATE KEY----- + type: git + url: git@github.com:/.git +``` + +2. Seal it and apply it: + +```shell +cfctl kubeseal +kubectl apply -f argo/default/secrets/my-repository-sealed-secret.yaml +``` diff --git a/web/docs/getting-started/05-argo-apps-deployment.md b/web/docs/getting-started/05-argo-apps-deployment.md deleted file mode 100644 index 1038b4cec..000000000 --- a/web/docs/getting-started/05-argo-apps-deployment.md +++ /dev/null @@ -1,559 +0,0 @@ -# 5. Argo Apps Deployment - -Time to play with Argo CD! - -Let's deploy the [Kube-Prometheus-Stack](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml) since we will need it to monitor the nodes. - -A complete example is written in `argo/monitoring/`, but we will start from scratch to learn the process of writing an Argo CD application. - -Start by creating the `argo/my-monitoring` directory, this will be our working directory. - -Some objects shouldn't be handled by Argo CD, such as volumes, secrets and namespaces. These objects must be created before the deployment of an Argo application. - -## 1. Namespace and AppProject - -Start with a namespace: - -```yaml title="argo/my-monitoring/namespace.yaml" -apiVersion: v1 -kind: Namespace -metadata: - name: my-monitoring - labels: - app.kubernetes.io/name: my-monitoring -``` - -and apply: - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/my-monitoring/namespace.yaml -``` - -And create an `AppProject`: - -```yaml title="argo/my-monitoring/app-project.yaml" -apiVersion: argoproj.io/v1alpha1 -kind: AppProject -metadata: - name: my-monitoring - namespace: argocd - # Finalizer that ensures that project is not deleted until it is not referenced by any application - finalizers: - - resources-finalizer.argocd.argoproj.io -spec: - description: My monitoring stack - # Allow manifests to deploy from any Git repos - sourceRepos: - - '*' - # Only permit applications to deploy to the namespace in the same cluster - destinations: - - namespace: my-monitoring - server: https://kubernetes.default.svc - - namespaceResourceWhitelist: - - kind: '*' - group: '*' - - clusterResourceWhitelist: - - kind: '*' - group: '*' -``` - -and apply: - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/my-monitoring/app-project.yaml -``` - -`AppProject` configures the permissions of the `Application`. This is to avoid supply chain attacks (for example malicious resources get injected into the git repositories). You can learn more [here](https://argo-cd.readthedocs.io/en/stable/user-guide/projects/). - -## 2. Prepare Volumes, Secrets, ConfigMaps and Ingresses - -It is best to know the configuration you need before deploying. - -You can read the `values.yaml` file inside the [git repository of kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml). - -We won't be deploying AlertManager, but we will deploy Grafana, Prometheus and the Prometheus Operator. 
- -Therefore, we need: - -- A persistent volume for Grafana -- A persistent volume for Prometheus -- A secret with the initial admin password for Grafana -- An Ingress for Grafana -- An Ingress for Prometheus - -However, we are lucky that the Helm Chart of Kube-Prometheus-Stack already handles ingresses. So we only need to add two PVs and a secret. - -If you are not familiar with Helm, it's a software used for templating, similar to Ansible. Every variable is stored inside a `values.yaml` file. Helm can override these values by adding a additional values file on top (for example `values-production.yaml`). - -If we were to deploy manually, we would call: - -```shell title="user@local:/" -helm install \ - -n \ - -f values.yaml \ - -f values-production.yaml \ - \ - ./ -``` - -Helm is also a package manager. Like any package manager, you need a repository URL. If we were to install an app coming from a repository, we would call: - -```shell title="user@local:/" -helm repo add -helm repo update - -helm install \ - -n \ - -f values-production.yaml \ - \ - / -``` - -We would store these commands in scripts. However, Argo CD is capable of deploying Helm applications, but also Kustomize and vanilla Kubernetes definition files. Argo CD is also able to synchronize with the remote repository, which means that it can perform rolling updates. - -This way, we can centralize every definition, configuration and environments files inside a Git repository, with a common syntax, in YAML. - -More details on Argo CD [here](https://argo-cd.readthedocs.io/en/stable/). - -### 2.1. Volumes - -We are going the create 2 storage classes with the NFS CSI driver. -The deployment of k0s should have the [NFS CSI driver](https://github.com/kubernetes-csi/csi-driver-nfs) pre-deployed. - -You could use other types of storage like [Rook](https://rook.io) or [Longhorn](https://longhorn.io). - -```yaml title="argo/my-monitoring/storageclasses.yaml" -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: grafana-nfs - namespace: my-monitoring - labels: - app: grafana-nfs - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 -provisioner: nfs.csi.k8s.io -parameters: - server: nfs.example.com - share: /srv/nfs/k8s/grafana - mountPermissions: '0775' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/region - values: - - ch-sion ---- -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: prometheus-nfs - namespace: my-monitoring - labels: - app: prometheus-nfs - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 -provisioner: nfs.csi.k8s.io -parameters: - server: nfs.example.com - share: /srv/nfs/k8s/prometheus - mountPermissions: '0775' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/region - values: - - ch-sion -``` - -Apply it: - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/my-monitoring/storageclasses.yaml -``` - -You could also create one unique StorageClass mounted on `/srv/nfs/k8s` for all the applications. However, this would mix all the volumes into a single directory and for the sake of the NFS server, we won't do that. 
- -You may notice that we've been using `topology.kubernetes.io/zone` since the beginning of the Getting Started. -It's a good practice to always annotate your nodes as some resources are not available in other zones. - -You can always learn more [here](https://kubernetes.io/docs/reference/labels-annotations-taints/#topologykubernetesiozone). - -You could also create a `PersistentVolume` and a `PersistentVolumeClaim` instead of a `StorageClass` (which is what we've done for [DeepSquare](https://deepsquare.io)). This is called static provisioning and is an acceptable solution. - -The official example of static provisioning is written [here](https://github.com/kubernetes-csi/csi-driver-nfs/tree/master/deploy/example). Both methods are good, but dynamic provisioning is more suitable for `StatefulSet` since it avoids creating a `PersistentVolume` for each replica. - -### 2.2. Secret - -Since we are doing GitOps, we will store a sealed secret in the git repository. Since the secret definition files are in plain text, to keep track of the version of the secrets, we need to push these files into git. - -To avoid storing clear-text secrets in git, SealedSecrets encrypts secrets using asymmetric encryption. Currently, a SealedSecrets controller should run on the Kubernetes cluster with a unique private key. This private key is the master key and should only be stored on the Kubernetes cluster. - -If you want to backup the key (because you want to do a migration, or to prepare for a disaster), you can follow [this guide](https://github.com/bitnami-labs/sealed-secrets#how-can-i-do-a-backup-of-my-sealedsecrets). You can also [backup the whole cluster using cfctl](/docs/guides/maintenance/backup-restore). - -:::warning - -The SealedSecrets keys and backups made by k0s are sensitive data! You should either delete them after a certain time or make sure that they are strongly protected. - -::: - -Create a secret named `grafana-admin-secret.yaml.local`. It is important to add `.local` at the end so it get filtered by Git. - -```yaml title="argo/my-monitoring/grafana-admin-secret.yaml.local" -apiVersion: v1 -kind: Secret -metadata: - name: grafana-admin-secret - namespace: my-monitoring -stringData: - admin-password: - admin-user: -type: Opaque -``` - -**DON'T APPLY IT**. First, we will encrypt it. - -Just run the `cfctl kubeseal`, this will generate a `grafana-admin-sealed-secret.yaml`. This file can be put inside the git repository. - -Apply this file: - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/my-monitoring/grafana-admin-sealed-secret.yaml -``` - -After applying the file, feel free to delete the `-secret.yaml.local` file. If you wish to retrieve the secret, like any secret, just use `kubectl get secret -o jsonpath='{.data}'`. - -## 3. Configure the Argo Application - -Let's start with the CRDs of kube-prometheus-stack. Because the CRDs are too large, we need to deploy an Argo CD application which only deploys the CRDs. 
- -Create the file `argo/my-monitoring/prometheus-crd-app.yaml` and add: - -```yaml title="argo/my-monitoring/prometheus-crd-app.yaml" -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: prometheus-crd-app - namespace: argocd - finalizers: - - resources-finalizer.argocd.argoproj.io -spec: - project: my-monitoring - source: - chart: kube-prometheus-stack - repoURL: https://github.com/prometheus-community/helm-charts.git - path: charts/kube-prometheus-stack/crds/ - targetRevision: kube-prometheus-stack-45.10.1 - - directory: - recurse: true - - destination: - server: 'https://kubernetes.default.svc' - namespace: my-monitoring - - syncPolicy: - automated: - prune: true # Specifies if resources should be pruned during auto-syncing ( false by default ). - selfHeal: true # Specifies if partial app sync should be executed when resources are changed only in target Kubernetes cluster and no git change detected ( false by default ). - allowEmpty: false # Allows deleting all application resources during automatic syncing ( false by default ). - syncOptions: - - Replace=true - retry: - limit: 5 # number of failed sync attempt retries; unlimited number of attempts if less than 0 - backoff: - duration: 5s # the amount to back off. Default unit is seconds, but could also be a duration (e.g. "2m", "1h") - factor: 2 # a factor to multiply the base duration after each failed retry - maxDuration: 3m # the maximum amount of time allowed for the backoff strategy -``` - -Next, we need to configure the Argo CD application that actually deploys the kube-prometheus stack. - -However, in order to apply custom values and still be GitOps compliant, we will need to use the [subchart](https://helm.sh/docs/chart_template_guide/subcharts_and_globals/) pattern. - -To do that, in your fork, create a subchart or reuse the existing one: - -```yaml title="helm-subcharts/kube-prometheus-stack/Chart.yaml" -apiVersion: v2 -name: kube-prometheus-stack-subchart -description: Kube Prometheus Stack subchart -type: application -version: 45.10.1 -appVersion: '0.1.2' - -dependencies: - - name: kube-prometheus-stack - version: 45.10.1 - repository: https://prometheus-community.github.io/helm-charts -``` - -We will create the `values.yaml` file later on. Create the Argo CD Application which will use the subchart: - -```yaml title="argo/my-monitoring/prometheus-app.yaml" -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: prometheus-app - namespace: argocd - finalizers: - - resources-finalizer.argocd.argoproj.io -spec: - project: my-monitoring - source: - # You should have forked this repo. Change the URL to your fork. - repoURL: git@github.com:/ClusterFactory.git - targetRevision: configs - path: helm-subcharts/kube-prometheus-stack - helm: - releaseName: prometheus - - skipCrds: true # skipCrds because CRDs are too long! - - # If the values file is not `values.yaml`: - # valueFiles: - # - values-example.yaml - - destination: - server: 'https://kubernetes.default.svc' - namespace: my-monitoring - - syncPolicy: - automated: - prune: true # Specifies if resources should be pruned during auto-syncing ( false by default ). - selfHeal: true # Specifies if partial app sync should be executed when resources are changed only in target Kubernetes cluster and no git change detected ( false by default ). - allowEmpty: false # Allows deleting all application resources during automatic syncing ( false by default ). 
- syncOptions: [] - retry: - limit: 5 # number of failed sync attempt retries; unlimited number of attempts if less than 0 - backoff: - duration: 5s # the amount to back off. Default unit is seconds, but could also be a duration (e.g. "2m", "1h") - factor: 2 # a factor to multiply the base duration after each failed retry - maxDuration: 3m # the maximum amount of time allowed for the backoff strategy -``` - -More details [here](https://github.com/argoproj/argo-cd/blob/master/docs/operator-manual/application.yaml). - -Since [Kube-Prometheus-Stack](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml) is a Helm application, we are going to override some values by creating a `values.yaml` file inside the subchart. - -We are also going to configure the Ingresses here. - -```yaml title="helm-subcharts/kube-prometheus-stack/values.yaml" -kube-prometheus-stack: - alertmanager: - enabled: false - - ## Using default values from https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml - ## - grafana: - enabled: true - - image: - repository: grafana/grafana-oss - tag: 8.5.1 - - persistence: - type: pvc - enabled: true - storageClassName: grafana-nfs - - securityContext: - runAsUser: 472 - runAsGroup: 472 - fsGroup: 472 - - admin: - existingSecret: 'grafana-admin-secret' - userKey: admin-user - passwordKey: admin-password - - initChownData: - enabled: false - - ingress: - enabled: true - ingressClassName: traefik - - annotations: - cert-manager.io/cluster-issuer: selfsigned-cluster-issuer - traefik.ingress.kubernetes.io/router.entrypoints: websecure - traefik.ingress.kubernetes.io/router.tls: 'true' - - hosts: - - grafana.example.com - - path: / - - tls: - - secretName: grafana.example.com-secret - hosts: - - grafana.example.com - - ## Component scraping the kube controller manager - ## - kubeControllerManager: - enabled: false - - ## Component scraping coreDns. Use either this or kubeDns - ## - coreDns: - enabled: false - - ## Component scraping kubeDns. 
Use either this or coreDns - ## - kubeDns: - enabled: false - - ## Component scraping etcd - ## - kubeEtcd: - enabled: false - - ## Component scraping kube scheduler - ## - kubeScheduler: - enabled: false - - ## Component scraping kube proxy - ## - kubeProxy: - enabled: false - - ## Component scraping kube state metrics - ## - kubeStateMetrics: - enabled: true - - ## Configuration for kube-state-metrics subchart - ## - kube-state-metrics: - prometheus: - monitor: - enabled: true - - ## Deploy node exporter as a daemonset to all nodes - ## - nodeExporter: - enabled: true - - ## Configuration for prometheus-node-exporter subchart - ## - prometheus-node-exporter: - prometheus: - monitor: - enabled: true - - ## Manages Prometheus and Alertmanager components - ## - prometheusOperator: - enabled: true - - ## Resource limits & requests - ## - resources: - limits: - cpu: 200m - memory: 200Mi - requests: - cpu: 100m - memory: 100Mi - - ## Deploy a Prometheus instance - ## - prometheus: - enabled: true - - ingress: - enabled: true - - annotations: - cert-manager.io/cluster-issuer: selfsigned-cluster-issuer - traefik.ingress.kubernetes.io/router.entrypoints: websecure - traefik.ingress.kubernetes.io/router.tls: 'true' - - hosts: - - prometheus.example.com - - paths: - - / - - tls: - - secretName: prometheus.example.com-secret - hosts: - - prometheus.example.com - - prometheusSpec: - ruleSelectorNilUsesHelmValues: false - serviceMonitorSelectorNilUsesHelmValues: false - podMonitorSelectorNilUsesHelmValues: false - probeSelectorNilUsesHelmValues: false - - resources: - limits: - cpu: 1 - memory: 2Gi - requests: - cpu: 200m - memory: 2Gi - - storageSpec: - volumeClaimTemplate: - spec: - storageClassName: 'prometheus-nfs' - accessModes: ['ReadWriteOnce'] - resources: - requests: - storage: 50Gi -``` - -Now, you can commit and push: - -```shell title="user@local:/ClusterFactory" -git add . -git commit -m "Added kube-prometheus-stack subchart" -git push -``` - -You can deploy the Argo CD app: - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/my-monitoring/prometheus-crd-app.yaml -kubectl apply -f argo/my-monitoring/prometheus-app.yaml -``` - -Congratulation, you have deployed an Argo CD app! - -You can observe the deployment in the Argo CD dashboard by following the URL [argocd.example.com](https://argocd.example.com). - -:::note - -To fetch the Argo CD password: - -```shell -kubectl get secret -n argocd argocd-initial-admin-secret -o jsonpath="{.data.password}" | base64 --decode) -``` - -::: - -![image-20220503170825336](04-argo-apps-deployment.assets/image-20220503170825336.png) - -![image-20220503171011051](04-argo-apps-deployment.assets/image-20220503171011051.png) - -Pretty cool, huh? - -However, ClusterFactory isn't just a Kubernetes Cluster. It contains all the apps necessary to create an HPC cluster. - -See the [guides](/docs/guides/monitoring/deploy) to deploy each application. Otherwise, let's deploy xCAT, our solution to deploy and manage bare metal servers! diff --git a/web/docs/getting-started/06-grendel-deployment.md b/web/docs/getting-started/06-grendel-deployment.md new file mode 100644 index 000000000..193f9bfdc --- /dev/null +++ b/web/docs/getting-started/06-grendel-deployment.md @@ -0,0 +1,390 @@ +# 6. Grendel Deployment + +The `argo/provisioning` directory deploys the Grendel application. + +## 1. Namespace and AppProject + +Create the Kubernetes namespace and ArgoCD AppProject. 
+ +```shell title="user@local:/ClusterFactory" +kubectl apply -f argo/provisioning +``` + +Kubernetes namespaces are used to isolate workloads and organize the Kubernetes cluster application. + +ArgoCD AppProjects are used in the continuous deployment process to prevent unauthorized deployment of resources. The more restrictive this is, the better we can avoid a supply chain attack. + +## 2. Preparing the dynamic provisioning of volumes + +Grendel needs to store its OS images. We will use NFS for the storage in this guide but there are other solution like OpenEBS or local-path (see the local-path-storage ArgoCD application in the `argo/local-path-storage` directory). + +We need to deploy a StorageClass so that Kubernetes can dynamically provision volumes. + +Look at the `argo/volumes/dynamic-nfs.yaml`: + +```yaml title="argo/volumes/dynamic-nfs.yaml" +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: dynamic-nfs + labels: + topology.kubernetes.io/region: ch-sion + topology.kubernetes.io/zone: ch-sion-1 +provisioner: nfs.csi.k8s.io +parameters: + server: nfs.example.com + share: /srv/nfs/dynamic + mountPermissions: '0775' +mountOptions: + - hard + - nfsvers=4.1 + - noatime + - nodiratime +volumeBindingMode: Immediate +reclaimPolicy: Retain +allowedTopologies: + - matchLabelExpressions: + - key: topology.kubernetes.io/region + values: + - ch-sion +``` + +Change the server address `nfs.example.com` to your NFS server and apply the resource. + +```yaml +kubectl apply -f argo/volumes/dynamic-nfs.yaml +``` + +## 3. Apps + +Since Grendel is using DHCP (and therefore L2 networking), we need to connect Grendel to the network connected to the compute nodes. To do that, we use Multus CNI with IPVLan. + +Let's start with the ArgoCD application declaration: + +```yaml title="argo/provisioning/apps/grendel-app.yaml" +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: grendel-app + namespace: argocd + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: provisioning + source: + # You should have forked this repo. Change the URL to your fork. + repoURL: git@github.com:/ClusterFactory.git + # You should use your branch too. + targetRevision: HEAD + path: helm/grendel + helm: + releaseName: grendel + + # We will create a values file inside the fork and change the values. + valueFiles: + - values-production.yaml + + destination: + server: 'https://kubernetes.default.svc' + namespace: provisioning + + syncPolicy: + automated: + prune: true # Specifies if resources should be pruned during auto-syncing ( false by default ). + selfHeal: true # Specifies if partial app sync should be executed when resources are changed only in target Kubernetes cluster and no git change detected ( false by default ). + allowEmpty: false # Allows deleting all application resources during automatic syncing ( false by default ). + syncOptions: [] + retry: + limit: 5 # number of failed sync attempt retries; unlimited number of attempts if less than 0 + backoff: + duration: 5s # the amount to back off. Default unit is seconds, but could also be a duration (e.g. "2m", "1h") + factor: 2 # a factor to multiply the base duration after each failed retry + maxDuration: 3m # the maximum amount of time allowed for the backoff strategy +``` + +Most of the options don't need to change, so just add `values-production.yaml` to the `valueFiles` field because we will create a `values-production.yaml`. 
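
To sanity-check the chart rendering before committing, you can render it locally with Helm (a quick sketch, assuming Helm 3 is installed on your machine and that the `values-production.yaml` described below has been created):

```shell title="user@local:/ClusterFactory"
# Render the grendel chart with the production overrides locally,
# so you can review the generated manifests before Argo CD syncs them.
helm template grendel helm/grendel -f helm/grendel/values-production.yaml
```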
+ +If you've looked inside the `helm/grendel/` directory, you can see the default `values.yaml`. To change these values, add the `values-production.yaml` file directly inside the helm application. + +## 4. Values configuration + +### Sticking the Grendel Pod to the right zone + +After adding the `values-production.yaml` file in the helm application directory. We can start by selecting where Grendel will be hosted: + +```yaml title="helm/grendel/values-production.yaml" +nodeSelector: + kubernetes.io/hostname: k0s-1.example.com +``` + +Since we are using IPVLAN, the pod needs to be stuck on a Kubernetes node with a known network interface. + +### Grendel Configuration Secret + +Grendel needs a configuration file which contains credentials. Therefore, you need to create a secret with the `grendel.toml` inside. Create a `grendel-secret.yaml.local` with the following content: + +```yaml title="argo/provisioning/secrets/grendel-secret.yaml.local" +apiVersion: v1 +kind: Secret +metadata: + name: grendel-secret + namespace: provisioning +type: Opaque +stringData: + grendel.toml: | + dbpath = ":memory:" + loggers = {cli="on", tftp="on", dhcp="on", dns="off", provision="on", api="on", pxe="on"} + admin_ssh_pubkeys = [] + + [provision] + listen = "0.0.0.0:80" + token_ttl = 3600 + root_password = "" + default_image = "" + repo_dir = "/var/lib/grendel" + + [dhcp] + listen = "0.0.0.0:67" + lease_time = "24h" + dns_servers = [] + domain_search = [] + mtu = 1500 + proxy_only = false + router_octet4 = 0 + subnets = [ + {gateway = "192.168.0.1/24", dns="192.168.0.100", domainSearch="example.com", mtu="1500"} + ] + + [dns] + listen = "0.0.0.0:53" + ttl = 86400 + + [tftp] + listen = "0.0.0.0:69" + + [pxe] + listen = "0.0.0.0:4011" + + [api] + socket_path = "/var/run/grendel/grendel-api.socket" + + [client] + api_endpoint = "/var/run/grendel/grendel-api.socket" + insecure = false + + [bmc] + user = "admin" + password = "password" + + [discovery] + user = "" + password = "" + domain = "" +``` + +**You need to change the `dhcp.subnets` configuration according to your network configuration.** + +Seal the secret and apply it: + +```shell title="user@local:/ClusterFactory" +cfctl kubeseal +kubectl apply -f argo/provisioning/secrets/grendel-sealed-secret.yaml +``` + +### Nodes configuration + +After adding the `values-production.yaml` file in the helm application directory. We can start by adding the provisioning configuration: + +```yaml title="helm/grendel/values-production.yaml" +config: + ## Secret containing grendel.toml + secretName: grendel-secret + secretKey: grendel.toml + + hosts: + - name: cn1 + provision: true + boot_image: squareos-8.6 + interfaces: + - ip: 10.10.2.51/24 + mac: aa:bb:cc:11:22:33 + bmc: false + - ip: 10.10.3.51/32 + bmc: true + + images: + - name: squareos-8.6 + kernel: '/var/lib/grendel/vmlinuz-4.18.0-372.19.1.el8_6.x86_64' + initrd: + - '/var/lib/grendel/initramfs-4.18.0-372.19.1.el8_6.x86_64.img' + liveimg: '/var/lib/grendel/squareos-8.6.squashfs' + cmdline: console=ttyS0 console=tty0 root=live:http://grendel.example.com/repo/squareos-8.6.squashfs BOOTIF=01-{{ $.nic.MAC | toString | replace ":" "-" }} grendel.hostname={{ $.host.Name }} grendel.address=http://grendel.example.com rd.live.overlay.readonly=1 rd.live.overlay.overlayfs=1 rd.neednet=1 + + postscript: | + #!/bin/sh + touch /hello-world +``` + +The MAC address corresponds to the network interface connected to the network with Grendel. 
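
If you have more than one compute node, append entries to the `hosts` list following the same pattern. Below is an illustrative sketch with made-up IP addresses and MAC addresses; adapt them to your own network:

```yaml title="helm/grendel/values-production.yaml"
config:
  hosts:
    # ...
    - name: cn2
      provision: true
      boot_image: squareos-8.6
      interfaces:
        # Interface on the provisioning network managed by Grendel
        - ip: 10.10.2.52/24
          mac: aa:bb:cc:11:22:34
          bmc: false
        # BMC address of the node, used for IPMI
        - ip: 10.10.3.52/32
          bmc: true
```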
+ +Inside the image configuration, you can notice some kernel parameters: + +- `console=ttyS0 console=tty0` means that the kernel messages will appear on both the first serial port and virtual terminal. +- `root=live:http://grendel.example.com/repo/squareos-8.6.squashfs` means that dracut will load the OS image as a live OS image. **Modify the URL based on the domain name you want to use.** +- `rd.live.overlay.readonly=1 rd.live.overlay.overlayfs=1 rd.neednet=1` are parameters relative to loading the live OS image. Here, we are mounting the OS image as a read-only base image for the OverlayFS. This is to create a stateless filesystem. +- `grendel.hostname={{ $.host.Name }} grendel.address=http://grendel.example.com` are parameters used to change the hostname of the OS and fetch the postscript. **Modify the URL based on the domain name you want to use.** + +### Persistence + +Remember the `dynamic-nfs` storage class we've just created? Let's use it now: + +```yaml title="helm/grendel/values-production.yaml" +persistence: + storageClassName: 'provisioning-nfs' + accessModes: ['ReadWriteMany'] + size: 20Gi + selectorLabels: + app: grendel +``` + +This will create a PersistentStorageClaim asking for 20Gi to the NFS provisioner. The NFS provisioner will create a directory inside the NFS with the following path `/srv/nfs/dynamic/pvc-`. The UUID in randomized. + +### IPVLAN configuration + +To expose Grendel to the external network, instead of using `LoadBalancers`, we use [Multus](https://github.com/k8snetworkplumbingwg/multus-cni). Generally, Multus is a CNI plugin to attach multiple network interfaces on Pods. However, we will use Multus CNI to replace the default network interface with an IPVLAN interface. + +IPVLAN allows us to directly expose the pod to the host network by assigning an IP to the pod. To do that, you must specify the network interface of the node with the `masterInterface` field. Then, you should allocate an address using the `ipam` field. + +```yaml title="helm/grendel/values-production.yaml" +net: + # Kubernetes host interface + masterInterface: eth0 + mode: l2 + type: ipvlan + + # https://www.cni.dev/plugins/current/ipam/static/ + ipam: + type: static + addresses: + - address: 192.168.0.3/24 + gateway: 10.10.2.1 + routes: + - dst: 0.0.0.0/0 + + # https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#pod-dns-config + dns: + nameservers: + - 1.1.1.1 +``` + +More details on IPAM [here](https://www.cni.dev/plugins/current/ipam/static/) and for IPVLAN [here](https://www.cni.dev/plugins/current/main/ipvlan/). + +### (Optional) IPMI API configuration + +The helm application can also deploy an IPMI API. This API doesn't use L2, so we can expose that service through Traefik by using an Ingress: + +```yaml title="helm/grendel/values-production.yaml" +ipmi: + ingress: + enabled: true + ingressClass: 'traefik' + + annotations: + cert-manager.io/cluster-issuer: private-cluster-issuer + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.tls: 'true' + + hosts: + - ipmi.example.com + + path: / + + tls: + - secretName: ipmi.example.com-secret + hosts: + - ipmi.example.com +``` + +With this, you can use `cfctl` to control your nodes. + +## CoreDNS configuration + +Remember to add a DNS entry each time you want to expose an application: + +```yaml title="core/coredns/overlays/prod/configmap.yaml" +data: + example.com.db: | + # ... 
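    # grendel.example.com must resolve to the IPVLAN address assigned to the Grendel pod (192.168.0.3 in this example);
    # adapt these example addresses to your own network.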
+ 192.168.0.3 grendel.example.com + 192.168.1.100 ipmi.example.com +``` + +## Commit, Push, Deploy + +Commit and push: + +```shell +git add . +git commit -m "Added Grendel application and values" +git push +``` + +Deploy the app: + +```shell title="user@local:/ClusterFactory" +kubectl apply -f argo/provisioning/apps/grendel-app.yaml +``` + +## (Optional) Building the OS Image + +This step is optional, you can download a pre-built SquareOS image: + +- [initramfs](https://sos-ch-dk-2.exo.io/osimages/squareos-8.6/initramfs-4.18.0-372.19.1.el8_6.x86_64.img) +- [OS image](https://sos-ch-dk-2.exo.io/osimages/squareos-8.6/squareos-8.6.squashfs) +- [linux kernel](https://sos-ch-dk-2.exo.io/osimages/squareos-8.6/vmlinuz-4.18.0-372.19.1.el8_6.x86_64) + +If you want to build it yourself, we use Packer to build the OS image. To build the OS image: + +- 1. Install Packer and QEMU. +- 2. Go to the `packer-recipes/rocky8.6`. +- 3. Build the OS image using the `build.bare.sh` script. +- 4. Extract the kernel, initramfs and create the squashfs file using the `export.bare.sh` script. + +## Adding the OS Image to Grendel + +After deploying Grendel, a file server is exposed for you to copy the OS images. + +You can access using this URL: http://grendel.example.com:8080 + +Drag & Drop the OS image, linux kernel and initramfs there. + +## BIOS configuration + +Make sure your nodes are configured with network boot as the first boot option. Grendel supports: + +- x86 Legacy +- x86 UEFI +- x86_64 UEFI +- ARM64 UEFI + +## IPMI commands, rebooting and provision + +If you've deployed the IPMI API, you can run: + +```shell title="user@local:/ClusterFactory" +export IPMIUSER= +export IPMIPASS= +export IPMIADDRESS=https://ipmi.example.com +cfctl ipmi +``` + +Reboot the nodes with `cfctl ipmi cn1 reset`. + +Read the logs of Grendel and the serial console of your node to see if the boot is successful. + +## Congratulation! + +You've finished the guide. However, there is still a lot of application we didn't deploy. Continue on these guides if you are interested: + +- [Deploy SLURM, the bare-metal batch scheduler](/docs/guides/slurm/deploy-slurm) +- [Configure the postscript to follow the GitOps practices](/docs/guides/provisioning/gitops-with-grendel) diff --git a/web/docs/getting-started/06-xcat-deployment.md b/web/docs/getting-started/06-xcat-deployment.md deleted file mode 100644 index 3a4950c56..000000000 --- a/web/docs/getting-started/06-xcat-deployment.md +++ /dev/null @@ -1,185 +0,0 @@ -# 6. xCAT Deployment - -The `argo/provisioning` directory deploys the xCAT application. - -This time, we won't start from scratch. - -However, the order is the same. - -## 1. Namespace and AppProject - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/provisioning -``` - -## 2. Volumes - -Start with the xCAT volume. This is where xCAT will be storing SQLite databases, os images and more. 
- -```yaml title="argo/provisioning/volumes/xcat-pv.yaml" -apiVersion: v1 -kind: PersistentVolume -metadata: - name: xcat-pv - namespace: provisioning - labels: - app: xcat -spec: - capacity: - storage: 50Gi - mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime - csi: - driver: nfs.csi.k8s.io - readOnly: false - volumeHandle: 0993a28c-4c2f-4edb-94ee-ec0e8a20efff - volumeAttributes: - server: nfs.example.com - share: '/srv/nfs/k8s/xcat' - accessModes: - - ReadWriteOnce - persistentVolumeReclaimPolicy: Retain -``` - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/provisioning/volumes/xcat-pv.yaml -``` - -The label `app=xcat` will be used by the `PersistentVolumeClaim` of the `StatefulSet` to locate the `PersistentVolume`. - -You can use a StorageClass if you want. We won't be running multiple xCAT replicas anyway. - -## 3. Apps - -Because, xCAT MUST use the host network for provisioning the bare metal servers, we will use Multus CNI to expose the pod to the external network. - -xCAT will deploy a lot of services including: - -- A DHCP Server -- A TFTP Server -- A RSync Server -- A DNS Server -- xCAT Services -- And [more](https://xcat-docs.readthedocs.io/en/stable/advanced/ports/xcat_ports.html) - -That's why we will use the Multus and CNI plugins to solve this particular problem. - -Let's start with the obvious: - -```yaml title="argo/provisioning/apps/xcat-app.yaml" -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: xcat-app - namespace: argocd - finalizers: - - resources-finalizer.argocd.argoproj.io -spec: - project: provisioning - source: - # You should have forked this repo. Change the URL to your fork. - repoURL: git@github.com:/ClusterFactory.git - # You should use your branch too. - targetRevision: HEAD - path: helm/xcat - helm: - releaseName: xcat - - # We will create a values file inside the fork and change the values. - valueFiles: - - values-production.yaml - - destination: - server: 'https://kubernetes.default.svc' - namespace: provisioning - - syncPolicy: - automated: - prune: true # Specifies if resources should be pruned during auto-syncing ( false by default ). - selfHeal: true # Specifies if partial app sync should be executed when resources are changed only in target Kubernetes cluster and no git change detected ( false by default ). - allowEmpty: false # Allows deleting all application resources during automatic syncing ( false by default ). - syncOptions: [] - retry: - limit: 5 # number of failed sync attempt retries; unlimited number of attempts if less than 0 - backoff: - duration: 5s # the amount to back off. Default unit is seconds, but could also be a duration (e.g. "2m", "1h") - factor: 2 # a factor to multiply the base duration after each failed retry - maxDuration: 3m # the maximum amount of time allowed for the backoff strategy -``` - -To edit the values, we won't need to use the subchart pattern because xCat is already defined inside the git repository. 
Add the `values-production.yaml` file directly inside the helm application: - -```yaml title="helm/xcat/values-production.yaml" -nodeSelector: - topology.kubernetes.io/region: ch-sion - topology.kubernetes.io/zone: ch-sion-1 - -resources: - requests: - cpu: '250m' - memory: '8Gi' - limits: - cpu: '8' - memory: '8Gi' - -persistence: - storageClassName: '' - accessModes: ['ReadWriteOnce'] - size: 50Gi - selectorLabels: - app: xcat - -net: - # Kubernetes host interface - masterInterface: eno2 - mode: l2 - type: ipvlan - - # https://www.cni.dev/plugins/current/ipam/static/ - ipam: - type: static - addresses: - - address: 192.168.0.3/24 - gateway: 192.168.0.1 - routes: - - dst: 0.0.0.0/0 - - dns: - nameservers: - - 127.0.0.1 - searches: - - example.com -``` - -`nodeSelector` is very useful to make sure that xCAT stays in the right zone. - -If you are using a StorageClass, remove the `persistence.selectorLabels` field. - -Let's focus on the `net` field. To expose xCAT to the external network, instead of using `LoadBalancers`, we use [Multus](https://github.com/k8snetworkplumbingwg/multus-cni). Multus is a CNI plugin to attach multiple network interfaces on Pods. - -However, we will use Multus CNI to replace the default network interface with a IPVLAN interface. - -IPVLAN allows us to directly expose the pod to the host network. To do that, you must specify the network interface of the node with the `masterInterface` field. Then, you should allocate an address using the `ipam` field. - -More details on IPAM [here](https://www.cni.dev/plugins/current/ipam/static/) and for IPVLAN [here](https://www.cni.dev/plugins/current/main/ipvlan/). - -This way, instead of using a Virtual Machine to deploy xCAT, you can use a container! - -Commit and push: - -```shell -git add . -git commit -m "Added xCAT application and values" -git push -``` - -Deploy the app: - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/provisioning/apps/xcat-app.yaml -``` - -Login to xCAT using the indicated IP address `ssh root@192.168.0.3 -p 2200` (the password is `cluster`). diff --git a/web/docs/getting-started/07-xcat-configuration.md b/web/docs/getting-started/07-xcat-configuration.md deleted file mode 100644 index a2fe7dc9a..000000000 --- a/web/docs/getting-started/07-xcat-configuration.md +++ /dev/null @@ -1,498 +0,0 @@ -# 7. xCAT Configuration - -The configuration of xCAT doesn't follow the GitOps ways and certainly doesn't follow the declarative way since we need to SSH to the pod. - -In the future, we plan to develop and integrate this feature. For now, let's just SSH to the container. - -In this guide, we will try to get as close as possible to the "declarative" method. xCAT works with stanza files. The whole xCAT cluster can be configured with one big stanza file. 
- -It looks like this: - -```shell -compute01: - objtype=node - arch=x86_64 - mgt=ipmi - cons=ipmi - bmc=10.1.0.12 - nictypes.etn0=ethernet - nicips.eth0=11.10.1.3 - nichostnamesuffixes.eth0=-eth0 - nicnetworks.eth0=clstrnet1 - nictypes.eth1=ethernet - nicips.eth1=60.0.0.7|70.0.0.7 - nichostnamesuffixes.eth1=-eth1|-eth1-lab - nicnetworks.eth1=clstrnet2|clstrnet3 - nicaliases.eth0="alias1 alias2" - nicaliases.eth1="alias3|alias4" -``` - -A full cluster looks like this: - -```shell -# - -montbhandler.pm: - objtype=notification - tableops=a,u,d - tables=monsetting - -192_168_0_0-255_255_255_0: - objtype=network - domain=ch1.deepsquare.run - gateway=192.168.0.1 - mask=255.255.255.0 - mgtifname=ens18 - mtu=1500 - nameservers=192.168.1.100 - net=192.168.0.0 - tftpserver= - -ib0ipv41: - objtype=network - mask=255.255.255.0 - mgtifname=ib0 - net=192.168.1.0 - -rocky8.6-x86_64-netboot-compute: - objtype=osimage - exlist=/xcatdata/install/rocky8.6/x86_64/Packages/compute.rocky8.x86_64.exlist - imagetype=linux - kernelver=4.18.0-305.17.1.el8_4.x86_64 - osarch=x86_64 - osname=Linux - osvers=rocky8.6 - permission=755 - postbootscripts=git-configs-execute its-a-fake-password-dont-worry compute - profile=compute - provmethod=netboot - pkgdir=/tmp - pkglist=/dev/null - rootimgdir=/install/netboot/rocky8.6/x86_64/compute - -cn1: - objtype=node - addkcmdline=modprobe.blacklist=nouveau crashkernel=256M - arch=x86_64 - bmc=10.10.3.51 - bmcpassword=password - bmcusername=admin - cons=ipmi - consoleenabled=1 - currstate=netboot rocky8.6-x86_64-compute - groups=compute,all - ip=192.168.0.51 - mac=ab:cd:ef:12:34:56 - mgt=ipmi - netboot=xnba - nicips.ib0=192.168.1.51 - nicnetworks.ib0=ib0ipv41 - nictypes.ib0=Infiniband - os=rocky8.6 - postbootscripts=nvidia-xorg - profile=compute - provmethod=rocky8.6-x86_64-netboot-compute - serialport=1 - serialspeed=115200 - status=booted - statustime=05-03-2022 10:23:10 - updatestatus=synced - updatestatustime=03-23-2022 10:27:05 - -1: - objtype=policy - name=root - rule=allow - -1.2: - objtype=policy - name=xcatmn - rule=trusted - -2: - objtype=policy - commands=getbmcconfig - rule=allow - -2.1: - objtype=policy - commands=remoteimmsetup - rule=allow - -2.3: - objtype=policy - commands=lsxcatd - rule=allow - -3: - objtype=policy - commands=nextdestiny - rule=allow - -4: - objtype=policy - commands=getdestiny - rule=allow - -4.4: - objtype=policy - commands=getpostscript - rule=allow - -4.5: - objtype=policy - commands=getcredentials - rule=allow - -4.6: - objtype=policy - commands=syncfiles - rule=allow - -4.7: - objtype=policy - commands=litefile - rule=allow - -4.8: - objtype=policy - commands=litetree - rule=allow - -4.9: - objtype=policy - commands=getadapter - rule=allow - -all: - objtype=group - members=cn1 - -compute: - objtype=group - members=cn1 - -clustersite: - objtype=site - SNsyncfiledir=/var/xcat/syncfiles - auditnosyslog=0 - auditskipcmds=ALL - blademaxp=64 - cleanupdiskfullxcatpost=no - cleanupxcatpost=no - consoleondemand=no - databaseloc=/var/lib - db2installloc=/mntdb2 - dhcplease=43200 - dnshandler=ddns - domain=xcat.provisioning.svc.cluster.local - enableASMI=no - forwarders=10.96.0.10 - fsptimeout=0 - installdir=/install - ipmimaxp=64 - ipmiretries=3 - ipmitimeout=2 - master=192.168.0.3 - maxssh=8 - nameservers=192.168.0.3 - nodesyncfiledir=/var/xcat/node/syncfiles - powerinterval=0 - ppcmaxp=64 - ppcretry=3 - ppctimeout=0 - sharedtftp=1 - sshbetweennodes=ALLGROUPS - syspowerinterval=0 - tftpdir=/tftpboot - timezone=Etc/UCT - useNmapfromMN=no - vsftp=n - 
xcatconfdir=/etc/xcat - xcatdport=3001 - xcatiport=3002 - -rocky8.6-x86_64: - objtype=osdistro - arch=x86_64 - basename=rocky - dirpaths=/install/rocky8.6/x86_64 - majorversion=8 - minorversion=4 - type=Linux -``` - -Some fields are auto-generated. So let's just configure the network, the OS Image and the nodes. - -## Network configuration - -```shell title="mystanzafile" -192_168_0_0-255_255_255_0: - objtype=network - domain=example.com - gateway=192.168.0.1 - mask=255.255.255.0 - mgtifname=ens18 - mtu=1500 - nameservers=192.168.1.100 - net=192.168.0.0 - tftpserver= -``` - -Adapt the configuration to your network and apply the stanza: - -```shell title="ssh root@xcat" -cat mystanzafile | mkdef -z -``` - -And regenerate the DNS and DHCP configuration: - -```shell title="ssh root@xcat" -echo "reconfiguring hosts..." -makehosts -echo "reconfiguring dns..." -makedns -echo "reconfiguring dhcpd config..." -makedhcp -n -echo "reconfiguring dhcpd leases..." -makedhcp -a -``` - -More details [here](https://xcat-docs.readthedocs.io/en/latest/guides/admin-guides/references/man5/networks.5.html). - -For Infiniband, follow [this guide](https://xcat-docs.readthedocs.io/en/stable/advanced/networks/infiniband/network_configuration.html). - -## OS Image configuration - -Use Packer to build OS images. - -You can build the SquareFactory OS image using the recipes stored in `packer-recipes`. It runs RedHat Kickstart and installs all the dependencies. - -You can then copy the root filesystem to xCAT using rsync. Follow the [guide "Build an OS Image with Packer" for more details](/docs/guides/provisioning/packer-build). - -```shell title="mystanzafile" -rocky8.6-x86_64-netboot-compute: - objtype=osimage - exlist=/install/rocky8.6/x86_64/Packages/compute.rocky8.x86_64.exlist - imagetype=linux - kernelver=4.18.0-305.17.1.el8_4.x86_64 - osarch=x86_64 - osname=Linux - osvers=rocky8.6 - permission=755 - postbootscripts=git-configs-execute its-a-fake-password-dont-worry compute - profile=compute - provmethod=netboot - pkgdir=/tmp - pkglist=/dev/null - rootimgdir=/install/netboot/rocky8.6/x86_64/compute -``` - -:::note - -Since we are doing GitOps, we do not need to use the xCAT provisioning system. Therefore, we set `pkgdir=/tmp` and `pkglist=/dev/null`. - -::: - -Our root filesystem is stored inside `/install/netboot/rocky8.6/x86_64/compute/rootimg`. - -The file `/install/rocky8.6/x86_64/Packages/compute.rocky8.x86_64.exlist` contains a list files/directories that are trimmed before packing the image. - -Create the file and add: - -```shell title="/install/rocky8.6/x86_64/Packages/compute.rocky8.x86_64.exlist" -./boot* -./usr/include* -./usr/lib/locale* -./usr/lib64/perl5/Encode/CN* -./usr/lib64/perl5/Encode/JP* -./usr/lib64/perl5/Encode/TW* -./usr/lib64/perl5/Encode/KR* -./lib/kbd/keymaps/i386* -./lib/kbd/keymaps/mac* -./lib/kdb/keymaps/include* -./usr/local/include* -./usr/local/share/man* -./usr/share/man* -./usr/share/cracklib* -./usr/share/doc* -./usr/share/gnome* -./usr/share/i18n* -+./usr/share/i18n/en_US* -./usr/share/info* -./usr/share/locale/* -+./usr/share/locale/en_US* -+./usr/share/locale/C* -+./usr/share/locale/locale.alias -+./usr/lib/locale/locale-archive -+./usr/lib/locale/en* -./usr/share/man* -./usr/share/omf* -./usr/share/vim/site/doc* -./usr/share/vim/vim74/doc* -./usr/share/zoneinfo* -./var/cache/man* -./var/lib/yum* -./tmp* -``` - -Create one post-boot script inside `/install/postscripts` called `git-configs-execute`, which `git clone` and executes scripts from a git repository. 
- -For example: - -```shell title="/install/postscripts/git-configs-execute" -#!/bin/sh -# Params: -# 1: password for the ssh key -# 2: node type (compute or private) - -set -x - -mkdir -p /configs -cat << EOF > /key.enc -# An encrypted private key using: -# openssl enc -aes-256-cbc -a -salt -pbkdf2 -in id_ed25519_api -out id_ed25519_api.enc -EOF -chmod 600 /key.enc -echo "$1" | openssl aes-256-cbc -d -a -pbkdf2 -in /key.enc -out /key -pass stdin -chmod 600 /key -GIT_SSH_COMMAND='ssh -i /key -o IdentitiesOnly=yes' git clone git@github.com:SquareFactory/compute-configs.git /configs -if [ -f /configs/post.sh ] && [ -x /configs/post.sh ]; then - cd /configs || exit 1 - ./post.sh "$2" -fi -rm -f /key /key.env - -# Security -chmod -R g-rwx,o-rwx . -``` - -This script clones `git@github.com:SquareFactory/compute-configs.git` and executes `post.sh` inside the git repository. - -This script enables us to use Git as the source of truth instead of xCAT. - -To apply the stanza: - -```shell title="ssh root@xcat" -cat mystanzafile | mkdef -z -``` - -To generate the kernel and initrd for the netboot, call: - -```shell title="ssh root@xcat" -geninitrd rocky8.6-x86_64-netboot-compute -``` - -To pack the image as SquashFS, call: - -```shell title="ssh root@xcat" -packimage -m squashfs -c pigz rocky8.6-x86_64-netboot-compute -``` - -More details [here](https://xcat-docs.readthedocs.io/en/stable/guides/admin-guides/references/man5/osimage.5.html). - -:::danger - -When using a diskless configuration, the image generated loses its linux capabilities. - -To determine which capabilities you need to restore, move to `/install/netboot/rocky8.6/x86_64/compute/rootimg` inside the xCAT container and run: - -```shell title="ssh root@xcat:/install/netboot/rocky8.6/x86_64/compute/rootimg" -{ - echo "#!/bin/bash" - echo "cd /" - find . |xargs getcap|awk -F= '{print "setcap" $2 " " $1}' -} > restorecap -chmod +x restorecap -mv restorecap /install/postscripts/restorecap -``` - -This command will create a `restorecap` script that you will need to add as postscript: - -```shell title="mystanzafile" -rocky8.6-x86_64-netboot-compute: - objtype=osimage - exlist=/install/rocky8.6/x86_64/Packages/compute.rocky8.x86_64.exlist - imagetype=linux - kernelver=4.18.0-305.17.1.el8_4.x86_64 - osarch=x86_64 - osname=Linux - osvers=rocky8.6 - permission=755 - postbootscripts=restorecap,git-configs-execute its-a-fake-password-dont-worry compute - profile=compute - provmethod=netboot - pkgdir=/tmp - pkglist=/dev/null - rootimgdir=/install/netboot/rocky8.6/x86_64/compute -``` - -```shell title="ssh root@xcat" -cat mystanzafile | mkdef -z -``` - -::: - -## Node configuration - -```shell title="mystanzafile" -cn1: - objtype=node - addkcmdline=modprobe.blacklist=nouveau crashkernel=256M - arch=x86_64 - bmc=10.10.3.51 - bmcpassword=password - bmcusername=admin - cons=ipmi - consoleenabled=1 - currstate=netboot rocky8.6-x86_64-compute - groups=compute,all - ip=192.168.0.51 - mac=ab:cd:ef:12:34:56 - mgt=ipmi - netboot=xnba - os=rocky8.6 - profile=compute - provmethod=rocky8.6-x86_64-netboot-compute - serialport=1 - serialspeed=115200 -``` - -Our compute node, which is outside of the Kubernetes cluster, has a BMC which permits us to configure the compute node via IPMI. More details on the architecture [here](/docs/main-concepts/apps/xcat). 
- -Adapt the configuration, then apply the stanza: - -```shell title="ssh root@xcat" -cat mystanzafile | mkdef -z -``` - -Regenerate the DNS and DHCP configuration: - -```shell title="ssh root@xcat" -echo "reconfiguring hosts..." -makehosts -echo "reconfiguring dns..." -makedns -echo "reconfiguring dhcpd config..." -makedhcp -n -echo "reconfiguring dhcpd leases..." -makedhcp -a -``` - -And regenerate the PXE boot configuration: - -```shell title="ssh root@xcat" -nodeset osimage=rocky8.6-x86_64-netboot-compute -``` - -More details [here](https://xcat-docs.readthedocs.io/en/stable/guides/admin-guides/references/man7/node.7.html). - -## Deploy - -```shell title="ssh root@xcat" -rpower cn1 on # or rpower cn1 reset -``` - -Congratulations, you've deployed a bare-metal server! xCAT is a heavy beast, but a complete bare metal provisioner. We recommend that you familiarize yourself with the software very quickly by reading the [xCAT documentation](https://xcat-docs.readthedocs.io/en/stable/overview/index.html). - -The next steps should be to configure your compute nodes and install a job scheduler like Slurm so you can run parallel jobs! diff --git a/web/docs/guides/01-setting-up-repository.md b/web/docs/guides/01-setting-up-repository.md index e66eac694..8c478feb0 100644 --- a/web/docs/guides/01-setting-up-repository.md +++ b/web/docs/guides/01-setting-up-repository.md @@ -179,7 +179,7 @@ If you want to deploy your applications, you should write your manifests and com │ │ └── values.yaml │ ├── openldap/ │ ├── slurm-cluster/ -│ └── xcat/ +│ └── grendel/ ├── manifests/ <----- Or HERE if it's a kustomized/vanilla Kubernetes application │ └── my-application/ <----- │ └── statefulset.yaml <----- @@ -258,4 +258,4 @@ spec: kubectl apply -f argo/default/apps/my-application.yaml ``` -Argo CD will deploy and synchroize automatically by following the HEAD commit. You can also specify the branch instead of `HEAD`. +Argo CD will deploy and synchronize automatically by following the HEAD commit. You can also specify the branch instead of `HEAD`. diff --git a/web/docs/guides/40-monitoring/01-deploy.md b/web/docs/guides/40-monitoring/01-deploy.md index 17f3d6a1f..a5272302e 100644 --- a/web/docs/guides/40-monitoring/01-deploy.md +++ b/web/docs/guides/40-monitoring/01-deploy.md @@ -26,168 +26,7 @@ Like in the Getting Started, we won't be deploying Thanos and AlertManager. kubectl apply -f argo/monitoring/ ``` -## 2. Persistent Volumes, PVC and Secrets - -### 2.a. Creating a `StorageClass` or `PersistentVolume` - -We will use NFS. Feel free to use another type of storage. We recommend at least 100 GB since the storage is used to store the root file system of the operating system images. 
- - - - -```yaml title="argo/monitoring/volumes/storage-classes.yaml" -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: grafana-nfs - namespace: monitoring - labels: - app: grafana - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -provisioner: nfs.csi.k8s.io -parameters: - server: # IP or host - share: # /srv/nfs/k8s/grafana - mountPermissions: '0775' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/region - values: - - # - ---- -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: prometheus-nfs - namespace: monitoring - labels: - app: prometheus - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -provisioner: nfs.csi.k8s.io -parameters: - server: # IP or host - share: # /srv/nfs/k8s/prometheus - mountPermissions: '0775' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/region - values: - - # - -``` - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/monitoring/volumes/storage-classes.yaml -``` - - - - -```yaml title="argo/monitoring/volumes/persistent-volumes.yaml" -apiVersion: v1 -kind: PersistentVolume -metadata: - name: grafana-pv - namespace: monitoring - labels: - app: grafana - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -spec: - capacity: - storage: 100Gi - mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime - csi: - driver: nfs.csi.k8s.io - readOnly: false - volumeHandle: # uuidgen - volumeAttributes: - server: # IP or host - share: # /srv/nfs/k8s/grafana - accessModes: - - ReadWriteOnce - persistentVolumeReclaimPolicy: Retain ---- -apiVersion: v1 -kind: PersistentVolume -metadata: - name: prometheus-pv - namespace: monitoring - labels: - app: prometheus - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -spec: - capacity: - storage: 100Gi - mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime - csi: - driver: nfs.csi.k8s.io - readOnly: false - volumeHandle: # uuidgen - volumeAttributes: - server: # IP or host - share: # /srv/nfs/k8s/prometheus - accessModes: - - ReadWriteOnce - persistentVolumeReclaimPolicy: Retain -``` - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/monitoring/volumes/persistent-volumes.yaml -``` - -The label `app=prometheus` will be used by the PersistentVolumeClaim. - -We also need a PVC for Grafana: - -```yaml title="argo/monitoring/volumes/persistent-volume-claim.yaml" -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: grafana-pv-claim - namespace: monitoring -spec: - volumeName: grafana-pv - storageClassName: '' - accessModes: - - ReadWriteMany - resources: - requests: - storage: 10Gi -``` - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/monitoring/volumes/persistent-volume-claim.yaml -``` - - - - -### 2.b. Add initial Grafana password as Secret +## 2. Secrets Create a SealedSecret which contains the initial credentials for Grafana: @@ -247,9 +86,6 @@ Read the [`values.yaml`](https://github.com/prometheus-community/helm-charts/blo Create a `values.yaml` inside the `helm-subcharts/kube-prometheus-stack` directory. 
- - - ```yaml title="helm-subcharts/kube-prometheus-stack/values.yaml" kube-prometheus-stack: alertmanager: @@ -265,7 +101,7 @@ kube-prometheus-stack: persistence: type: pvc enabled: true - storageClassName: grafana-nfs + storageClassName: dynamic-nfs securityContext: runAsUser: 472 @@ -399,176 +235,13 @@ kube-prometheus-stack: storageSpec: volumeClaimTemplate: spec: - storageClassName: 'prometheus-nfs' + storageClassName: 'dynamic-nfs' accessModes: ['ReadWriteOnce'] resources: requests: storage: 50Gi ``` - - - -```yaml title="helm-subcharts/kube-prometheus-stack/values.yaml" -kube-prometheus-stack: - alertmanager: - enabled: false - - grafana: - enabled: true - - image: - repository: grafana/grafana-oss - tag: 8.4.5 - - persistence: - type: pvc - enabled: true - existingClaim: grafana-pv-claim - - securityContext: - runAsUser: 472 - runAsGroup: 472 - fsGroup: 472 - - admin: - existingSecret: 'grafana-admin-secret' - userKey: admin-user - passwordKey: admin-password - - initChownData: - enabled: false - - ingress: - enabled: true - ingressClassName: traefik - - annotations: - cert-manager.io/cluster-issuer: selfsigned-cluster-issuer - traefik.ingress.kubernetes.io/router.entrypoints: websecure - traefik.ingress.kubernetes.io/router.tls: 'true' - - hosts: - - grafana.example.com - - path: / - - tls: - - secretName: grafana.example.com-secret - hosts: - - grafana.example.com - - ## Component scraping the kube controller manager - ## - kubeControllerManager: - enabled: false - - ## Component scraping coreDns. Use either this or kubeDns - ## - coreDns: - enabled: false - - ## Component scraping kubeDns. Use either this or coreDns - ## - kubeDns: - enabled: false - - ## Component scraping etcd - ## - kubeEtcd: - enabled: false - - ## Component scraping kube scheduler - ## - kubeScheduler: - enabled: false - - ## Component scraping kube proxy - ## - kubeProxy: - enabled: false - - ## Component scraping kube state metrics - ## - kubeStateMetrics: - enabled: true - - ## Configuration for kube-state-metrics subchart - ## - kube-state-metrics: - prometheus: - monitor: - enabled: true - - ## Deploy node exporter as a daemonset to all nodes - ## - nodeExporter: - enabled: true - - ## Configuration for prometheus-node-exporter subchart - ## - prometheus-node-exporter: - prometheus: - monitor: - enabled: true - - ## Manages Prometheus and Alertmanager components - ## - prometheusOperator: - enabled: true - - ## Deploy a Prometheus instance - ## - prometheus: - enabled: true - - ingress: - enabled: true - - annotations: - cert-manager.io/cluster-issuer: selfsigned-cluster-issuer - traefik.ingress.kubernetes.io/router.entrypoints: websecure - traefik.ingress .kubernetes.io/router.tls: 'true' - - hosts: - - prometheus.example.com - - paths: - - / - - tls: - - secretName: prometheus.example.com-secret - hosts: - - prometheus.example.com - - prometheusSpec: - ruleSelectorNilUsesHelmValues: false - serviceMonitorSelectorNilUsesHelmValues: false - podMonitorSelectorNilUsesHelmValues: false - probeSelectorNilUsesHelmValues: false - - resources: - limits: - cpu: 1 - memory: 2Gi - requests: - cpu: 200m - memory: 2Gi - - storageSpec: - volumeClaimTemplate: - spec: - volumeName: 'prometheus-pv' - storageClassName: '' - accessModes: ['ReadWriteOnce'] - resources: - requests: - storage: 50Gi -``` - - - - In case you don't know how to use `Ingress` with `cert-manager` and Traefik. 
Use the annotations `traefik.ingress.kubernetes.io/router.entrypoints` and `traefik.ingress.kubernetes.io/router.tls` to indicate the port used by Traefik. The `cfctl.yaml` indicates that the entry-point `websecure` is port 443. diff --git a/web/docs/guides/40-monitoring/_category_.json b/web/docs/guides/40-monitoring/_category_.json index 957159c28..1b751fdd0 100644 --- a/web/docs/guides/40-monitoring/_category_.json +++ b/web/docs/guides/40-monitoring/_category_.json @@ -1,3 +1,3 @@ { - "label": "Monitoring Stack" + "label": "Monitoring" } diff --git a/web/docs/guides/50-provisioning/01-deploy-grendel.mdx b/web/docs/guides/50-provisioning/01-deploy-grendel.mdx new file mode 100644 index 000000000..1e348c28d --- /dev/null +++ b/web/docs/guides/50-provisioning/01-deploy-grendel.mdx @@ -0,0 +1,3 @@ +# How to deploy Grendel + +Please read the [Getting Started](/docs/getting-started/grendel-deployment). diff --git a/web/docs/guides/50-provisioning/01-deploy-xcat.assets/image-20220506142457636.png b/web/docs/guides/50-provisioning/01-deploy-xcat.assets/image-20220506142457636.png deleted file mode 100644 index 76aa1f73e..000000000 Binary files a/web/docs/guides/50-provisioning/01-deploy-xcat.assets/image-20220506142457636.png and /dev/null differ diff --git a/web/docs/guides/50-provisioning/01-deploy-xcat.assets/xcat.drawio.svg b/web/docs/guides/50-provisioning/01-deploy-xcat.assets/xcat.drawio.svg deleted file mode 100644 index 933d786f0..000000000 --- a/web/docs/guides/50-provisioning/01-deploy-xcat.assets/xcat.drawio.svg +++ /dev/null @@ -1,233 +0,0 @@ - - - - - - - - - - - -
[Deleted diagram xcat.drawio.svg: it showed the xCAT pod exposed via IPVLAN on eth0 of a K8s node at 192.168.0.3, a compute node at 192.168.0.51 on the 192.168.0.1/24 management network, the node's BMC at 10.10.3.51 on the 10.10.3.1/24 BMC (IPMI) network, and a router connecting the networks.]
diff --git a/web/docs/guides/50-provisioning/01-deploy-xcat.mdx b/web/docs/guides/50-provisioning/01-deploy-xcat.mdx deleted file mode 100644 index c8754164b..000000000 --- a/web/docs/guides/50-provisioning/01-deploy-xcat.mdx +++ /dev/null @@ -1,247 +0,0 @@ -# How to deploy xCAT - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - -![xCAT deployment graph](01-deploy-xcat.assets/image-20220506142457636.png) - -## Helm and Docker resources - -The Helm resources are stored on [ClusterFactory Git Repository](https://github.com/SquareFactory/ClusterFactory/tree/main/helm/xcat). - -The Dockerfile is described in the git repository [SquareFactory/xcat-rocky](https://github.com/SquareFactory/xcat-rocky). - -An Docker image can be pulled with: - -```shell -docker pull ghcr.io/squarefactory/xcat-rocky:latest -``` - -## 1. Deploy Namespace and AppProject - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/provisioning/ -``` - -## 2. Creating a `StorageClass` or `PersistentVolume` - -We will use NFS. Feel free to use another type of storage. We recommend at least 100 GB since the storage is used to store the root file system of the operating system images. - - - - -```yaml title="argo/provisioning/volumes/storage-class.yaml" -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: xcat-nfs - namespace: provisioning - labels: - app: xcat - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -provisioner: nfs.csi.k8s.io -parameters: - server: # IP or host - share: # /srv/nfs/k8s/xcat - mountPermissions: '0775' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/region - values: - - # - -``` - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/provisioning/volumes/storage-class.yaml -``` - - - - -```yaml title="argo/provisioning/volumes/persistent-volume.yaml" -apiVersion: v1 -kind: PersistentVolume -metadata: - name: xcat-pv - namespace: provisioning - labels: - app: xcat - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -spec: - capacity: - storage: 100Gi - mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime - csi: - driver: nfs.csi.k8s.io - readOnly: false - volumeHandle: # uuidgen - volumeAttributes: - server: # IP or host - share: # /srv/nfs/k8s/xcat - accessModes: - - ReadWriteOnce - persistentVolumeReclaimPolicy: Retain -``` - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/provisioning/volumes/persistent-volume.yaml -``` - -The label `app=xcat` will be used by the PersistentVolumeClaim. - - - - -## 3. Editing `xcat-app.yaml` to use the fork - -```yaml title="argo.example/provisioning/apps/xcat-app.yaml > spec > source" -source: - # You should have forked this repo. Change the URL to your fork. - repoURL: git@github.com:/ClusterFactory.git - # You should use your branch too. - targetRevision: HEAD - path: helm/xcat - helm: - releaseName: xcat - - valueFiles: - - values-production.yaml -``` - -## 4. Adding custom values - -:::tip - -Read the [`values.yaml`](https://github.com/SquareFactory/ClusterFactory/blob/main/helm/xcat/values.yaml) to see all the default values. - -::: - -### 4.a. Add the values file to the chart - -Create `values-production.yaml` inside the `helm/xcat/` directory. - -### 4.b. Selecting a zone - -xCAT will use the host network. 
Make sure that xCAT stays on the same network by using the `nodeSelector`. - -Your nodes should be annotated with `topology.kubernetes.io/region` and `topology.kubernetes.io/zone`. - -```yaml title="helm/xcat/values-production.yaml" -nodeSelector: - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -``` - -### 4.c. Network configuration - -xCAT will be connected to the host network using the IPVLAN CNI plugin. Make sure that Multus CNI is already installed. - -:::tip - -Check with: - -```shell -> kubectl get daemonset -n kube-system kube-multus-ds -NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE -kube-multus-ds 1 1 1 1 1 28d -``` - -If it isn't already deployed, you can deploy Multus with: - -```shell -kubectl apply -f https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/master/deployments/multus-daemonset-thick-plugin.yaml -``` - -::: - -![xcat network diagram](./01-deploy-xcat.assets/xcat.drawio.svg) - -Similar to configuring the network interface of a Virtual Machine, you must change these fields: - -```yaml title="title="helm/xcat/values-production.yaml" -# ... - -net: - # Kubernetes host interface - masterInterface: eth0 - mode: l2 - type: ipvlan - - # https://www.cni.dev/plugins/current/ipam/static/ - ipam: - type: static - addresses: - - address: 192.168.0.3/24 - gateway: 192.168.0.1 - routes: - - dst: 0.0.0.0/0 - - # https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#pod-dns-config - dns: - nameservers: - - 127.0.0.1 -``` - -Use IPAM `static` since xCAT has a ISC DHCP server. - -### 4.d. Volume configuration - - - - -Edit the values accordingly: - -```yaml title="helm/xcat/values-production.yaml" -# ... -persistence: - storageClassName: 'xcat-nfs' - accessModes: ['ReadWriteOnce'] - size: 50Gi -``` - - - - -```yaml title="helm/xcat/values-production.yaml" -# ... -persistence: - storageClassName: '' - accessModes: ['ReadWriteOnce'] - size: 50Gi - selectorLabels: - app: xcat - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -``` - - - - -## 5. Deploy xCAT - -Commit and push: - -```shell title="user@local:/ClusterFactory" -git add . -git commit -m "Added xCAT application and values" -git push -``` - -And deploy: - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/provisioning/apps/xcat-app.yaml -``` diff --git a/web/docs/guides/50-provisioning/02-packer-build.md b/web/docs/guides/50-provisioning/02-packer-build.md index a9bfc31d9..d5b19b132 100644 --- a/web/docs/guides/50-provisioning/02-packer-build.md +++ b/web/docs/guides/50-provisioning/02-packer-build.md @@ -1,19 +1,19 @@ # Build an OS Image with Packer -## Rocky Linux OS Image +## SquareOS Image ### Configuring and Launching Packer The [`packer-recipes` directory inside the git repository](https://github.com/SquareFactory/ClusterFactory/tree/main/packer-recipes) contains examples of Packer configuration files. 
-```json title="rocky.nothing.json" +```json title="compute.bare.json" { "variables": { "boot_wait": "3s", - "disk_size": "10G", - "iso_checksum": "5a0dc65d1308e47b51a49e23f1030b5ee0f0ece3702483a8a6554382e893333c", - "iso_url": "https://download.rockylinux.org/pub/rocky/8/isos/x86_64/Rocky-8.5-x86_64-boot.iso", - "memsize": "8192", + "disk_size": "50G", + "iso_checksum": "fe77cc293a2f2fe6ddbf5d4bc2b5c820024869bc7ea274c9e55416d215db0cc5", + "iso_url": "https://download.rockylinux.org/vault/rocky/8.6/isos/x86_64/Rocky-8.6-x86_64-boot.iso", + "memsize": "2048", "numvcpus": "4" }, "builders": [ @@ -23,7 +23,7 @@ The [`packer-recipes` directory inside the git repository](https://github.com/Sq "communicator": "none", "boot_command": [ " ", - "inst.ks=http://{{ .HTTPIP }}:{{ .HTTPPort }}/ks.nothing.cfg ", + "inst.ks=http://{{ .HTTPIP }}:{{ .HTTPPort }}/ks.bare.cfg ", "inst.cmdline", "" ], @@ -51,215 +51,32 @@ When running Packer, the process is the following: - Run the Kickstart RedHat file. This file automates the installation process of the OS. - Shut down the VM. -Configure the installation process by editing the `http/ks.nothing.cfg`: - -```shell title="http/ks.nothing.cfg" -url --url="https://dl.rockylinux.org/vault/rocky/8.4/BaseOS/x86_64/os/" -# License agreement -eula --agreed -# Disable Initial Setup on first boot -firstboot --disable -# Poweroff after the install is finished -poweroff -# Firewall -firewall --enabled --service=ssh -# Disable Initial Setup on first boot -firstboot --disable -ignoredisk --only-use=vda -# System language -lang en_US.UTF-8 -# Keyboard layout -keyboard us -# Network information -network --bootproto=dhcp --device=eth0 -# SELinux configuration -selinux --disabled -# System timezone -timezone UTC --utc -# System bootloader configuration -bootloader --location=mbr --driveorder="vda" --timeout=1 -# Root password -rootpw --plaintext an_example_of_default_password -# System services -services --enabled="chronyd" - -repo --name="AppStream" --baseurl=https://dl.rockylinux.org/vault/rocky/8.4/AppStream/x86_64/os/ -repo --name="Extras" --baseurl=https://dl.rockylinux.org/vault/rocky/8.4/extras/x86_64/os/ -repo --name="PowerTools" --baseurl=https://dl.rockylinux.org/vault/rocky/8.4/PowerTools/x86_64/os/ -repo --name="epel" --baseurl=https://mirror.init7.net/fedora/epel/8/Everything/x86_64/ - -# Clear the Master Boot Record -zerombr -# Remove partitions -clearpart --all --initlabel -# Automatically create partition -part / --size=1 --grow --asprimary --fstype=xfs - -# Postinstall -%post --erroronfail -set -ex - -# Postinstall -#-- No firewall -systemctl disable firewalld - -# Install xCat provisioning service -curl -fsSL "https://raw.githubusercontent.com/xcat2/xcat-core/master/xCAT/postscripts/xcatpostinit1.netboot" -o /opt/xcat/xcatpostinit1 -chmod 755 /opt/xcat/xcatpostinit1 - -curl -fsSL "https://raw.githubusercontent.com/xcat2/xcat-core/master/xCAT/postscripts/xcatpostinit1.service" -o /etc/systemd/system/xcatpostinit1.service -ln -s "../xcatpostinit1.service" "/etc/systemd/system/multi-user.target.wants/xcatpostinit1.service" - -# Kickstart copies install boot options. Serial is turned on for logging with -# Packer which disables console output. 
Disable it so console output is shown -# during deployments -sed -i 's/^GRUB_TERMINAL=.*/GRUB_TERMINAL_OUTPUT="console"/g' /etc/default/grub -sed -i '/GRUB_SERIAL_COMMAND="serial"/d' /etc/default/grub -sed -ri 's/(GRUB_CMDLINE_LINUX=".*)\s+console=ttyS0(.*")/\1\2/' /etc/default/grub - -# Clean up install config not applicable to deployed environments. -for f in resolv.conf fstab; do - rm -f /etc/$f - touch /etc/$f - chown root:root /etc/$f - chmod 644 /etc/$f -done - -cat << EOF >>/etc/fstab -devpts /dev/pts devpts gid=5,mode=620 0 0 -tmpfs /dev/shm tmpfs defaults 0 0 -proc /proc proc defaults 0 0 -sysfs /sys sysfs defaults 0 0 -EOF - -rm -f /etc/sysconfig/network-scripts/ifcfg-[^lo]* - -dnf clean all -%end - -%packages --excludedocs --excludedocs -@minimal-environment -chrony - -bash-completion -cloud-init -# cloud-init only requires python3-oauthlib with MAAS. As such upstream -# removed this dependency. -python3-oauthlib -rsync -tar -# grub2-efi-x64 ships grub signed for UEFI secure boot. If grub2-efi-x64-modules -# is installed grub will be generated on deployment and unsigned which breaks -# UEFI secure boot. -grub2-efi-x64 -efibootmgr -shim-x64 -dosfstools -lvm2 -mdadm -device-mapper-multipath -iscsi-initiator-utils - --plymouth -# Remove Intel wireless firmware --i*-firmware -%end -``` +Configure the installation process by editing the `http/ks.bare.cfg`. The `packages` and `post` blocks are probably what you are interested in. Run packer with: ```shell title="user@local:/ClusterFactory/packer-recipes/rocky" -packer build -var "numvcpus=12" -var "memsize=23609" rocky.nothing.json +packer build -var "numvcpus=12" -var "memsize=23609" compute.bare.json ``` -The next steps is to export to xCAT, which can be a little tricky. - -### Exporting to xCAT - -After building the OS image, a qcow2 file is generated inside a `output-qemu` directory. +### Extracting the kernel, initramfs and create a squashfs for Grendel -To extract the root filesystem, you need to have `qemu-nbd` and `qemu-img` installed. +There the script `export.bare.sh` which can help you extract the kernel, initramfs and create a squashfs. -Edit and execute the `export.bare.sh` script **as root** to export the root filesystem to xCAT (xCAT root password should be `cluster`): +You must run it by root. ```shell title="export.bare.sh" -#!/bin/bash -ex - -export IMAGE_PATH=output-qemu/packer-qemu -export XCAT_SERVER=root@192.168.0.3 # You may need to edit this -export EXPORT_PATH=/install/netboot/rocky8.6/x86_64/compute/rootimg/ - -teardown() { - source ./scripts-local/teardown-nbd -} - -trap teardown EXIT - +# Mount the disk source ./scripts-local/setup-nbd -source ./scripts-local/rsync-to-xcat -``` - -:::caution - -Make sure it exports to the right place. The `--delete` flag has been set in the script and it can delete anything in the `EXPORT_PATH` directory. - -::: - -:::info - -If you have `qemu-nbd: Failed to blk_new_open 'output-qemu/packer-qemu': Failed to get "write" lock`, it means the block device is already mounted. - -You should run `qemu-nbd -d /dev/nbdX` to dismount a block devices. - -::: - -### Configure the OS Image on xCAT - -SSH to the xCAT server (`ssh root@192.168.0.3 -p 2200`). 
- -Create a stanza: - -```shell title="osimage.stanza" -rocky8.6-x86_64-netboot-compute: - objtype=osimage - exlist=/install/rocky8.6/x86_64/Packages/compute.rocky8.x86_64.exlist - imagetype=linux - osarch=x86_64 - osname=Linux - osvers=rocky8.6 - permission=755 - profile=compute - provmethod=netboot - rootimgdir=/install/netboot/rocky8.6/x86_64/compute - pkgdir=/tmp - pkglist=/dev/null -``` - -:::note - -Since we are doing GitOps, we do not need to use the xCAT provisioning system. Therefore, we set `pkgdir=/tmp` and `pkglist=/dev/null`. - -::: - -And apply it: - -```shell title="ssh root@xcat" -cat osimage.stanza | mkdef -z -``` - -### Generate the initramfs and pack the image as squashfs - -Generate the kernel and initrd for the netboot: - -```shell title="ssh root@xcat" -geninitrd rocky8.6-x86_64-netboot-compute -``` +# Extract initramfs and kernel +./scripts-local/exec-dracut -To pack the image as SquashFS, call: +# Squash the image +OUTPUT=squareos-8.6.squashfs ./scripts-local/squash-root -```shell title="ssh root@xcat" -packimage -m squashfs -c pigz rocky8.6-x86_64-netboot-compute +# Unmount the disk +source ./scripts-local/teardown-nbd ``` diff --git a/web/docs/guides/50-provisioning/03-configure-xcat.md b/web/docs/guides/50-provisioning/03-configure-xcat.md deleted file mode 100644 index b743eecdd..000000000 --- a/web/docs/guides/50-provisioning/03-configure-xcat.md +++ /dev/null @@ -1,309 +0,0 @@ -# Configure xCAT to provision the nodes - -:::info - -In the next version of ClusterFactory, xCAT will be a Kubernetes operator. - -This means that the stanza file for the definition of the cluster can be written in YAML, and there will be no need to SSH to xCAT. - -::: - -## Network Configuration - -The name of the object is precise. You can SSH to xCAT and type -`lsdef -t network` to look for the name of the network. Otherwise, the name of -the network looks like this `192_168_0_0-255_255_255_0`, which is the one configured with Multus CNI. - -```shell title="network.stanza" -192_168_0_0-255_255_255_0: - objtype=network - domain=example.com - gateway=192.168.0.1 - mask=255.255.255.0 - mgtifname=ens18 - mtu=1500 - nameservers=192.168.1.100 - net=192.168.0.0 - tftpserver= -``` - -:::caution - -Don't replace ``. - -::: - -Edit the file [accordingly](https://xcat-docs.readthedocs.io/en/stable/guides/admin-guides/references/man5/networks.5.html). - -Apply the stanza: - -```shell title="ssh root@xcat" -cat mystanzafile | mkdef -z -``` - -And regenerate the DNS and DHCP configuration: - -```shell title="ssh root@xcat" -echo "reconfiguring hosts..." -makehosts -echo "reconfiguring dns..." -makedns -echo "reconfiguring dhcpd config..." -makedhcp -n -echo "reconfiguring dhcpd leases..." -makedhcp -a -``` - -More details [here](https://xcat-docs.readthedocs.io/en/latest/guides/admin-guides/references/man5/networks.5.html). - -For Infiniband, follow [this guide](https://xcat-docs.readthedocs.io/en/stable/advanced/networks/infiniband/network_configuration.html). - -## OS Image configuration - -Use Packer to build OS images. - -You can build the SquareFactory OS image using the recipes stored in `packer-recipes`. Basically, it runs RedHat Kickstart and install all the software needed for [DeepSquare](https://deepsquare.io). - -After building the image, you should copy the root filesystem via `rsync` or `scp`. Follow [this guide for more information](/docs/guides/provisioning/packer-build). 
- -Create the stanza: - -```shell title="osimage.stanza" -rocky8.6-x86_64-netboot-compute: - objtype=osimage - exlist=/install/rocky8.6/x86_64/Packages/compute.rocky8.x86_64.exlist - imagetype=linux - osarch=x86_64 - osname=Linux - osvers=rocky8.6 - permission=755 - profile=compute - provmethod=netboot - pkgdir=/tmp - pkglist=/dev/null - rootimgdir=/install/netboot/rocky8.6/x86_64/compute -``` - -:::note - -Since we are doing GitOps, we do not need to use the xCAT provisioning system. Therefore, we set `pkgdir=/tmp` and `pkglist=/dev/null`. - -::: - -Our root filesystem is stored inside `/install/netboot/rocky8.6/x86_64/compute/rootimg`. - -The file `/install/rocky8.6/x86_64/Packages/compute.rocky8.x86_64.exlist` contains a list files/directories that are trimmed before packing the image. - -Create the file and add: - -```shell title="/install/rocky8.6/x86_64/Packages/compute.rocky8.x86_64.exlist" -./boot* -./usr/include* -./usr/lib/locale* -./usr/lib64/perl5/Encode/CN* -./usr/lib64/perl5/Encode/JP* -./usr/lib64/perl5/Encode/TW* -./usr/lib64/perl5/Encode/KR* -./lib/kbd/keymaps/i386* -./lib/kbd/keymaps/mac* -./lib/kdb/keymaps/include* -./usr/local/include* -./usr/local/share/man* -./usr/share/man* -./usr/share/cracklib* -./usr/share/doc* -./usr/share/gnome* -./usr/share/i18n* -+./usr/share/i18n/en_US* -./usr/share/info* -./usr/share/locale/* -+./usr/share/locale/en_US* -+./usr/share/locale/C* -+./usr/share/locale/locale.alias -+./usr/lib/locale/locale-archive -+./usr/lib/locale/en* -./usr/share/man* -./usr/share/omf* -./usr/share/vim/site/doc* -./usr/share/vim/vim74/doc* -./usr/share/zoneinfo* -./var/cache/man* -./var/lib/yum* -./tmp* -``` - -Edit [accordingly](https://xcat-docs.readthedocs.io/en/stable/guides/admin-guides/basic_concepts/xcat_object/osimage.html), and apply it: - -```shell title="ssh root@xcat" -cat osimage.stanza | mkdef -z -``` - -`/install/netboot/rocky8.6/x86_64/compute/rootimg` should contains the root file-system. - -`/install/rocky8.6/x86_64/Packages/compute.rocky8.x86_64.exlist` contains a list files/directories that are trimmed before packing the image. - -Example: - -```shell title="/install/rocky8.6/x86_64/Packages/compute.rocky8.x86_64.exlist" -./boot* -./usr/include* -./usr/lib/locale* -./usr/lib64/perl5/Encode/CN* -./usr/lib64/perl5/Encode/JP* -./usr/lib64/perl5/Encode/TW* -./usr/lib64/perl5/Encode/KR* -./lib/kbd/keymaps/i386* -./lib/kbd/keymaps/mac* -./lib/kdb/keymaps/include* -./usr/local/include* -./usr/local/share/man* -./usr/share/man* -./usr/share/cracklib* -./usr/share/doc* -./usr/share/gnome* -./usr/share/i18n* -+./usr/share/i18n/en_US* -./usr/share/info* -./usr/share/locale/* -+./usr/share/locale/en_US* -+./usr/share/locale/C* -+./usr/share/locale/locale.alias -+./usr/lib/locale/locale-archive -+./usr/lib/locale/en* -./usr/share/man* -./usr/share/omf* -./usr/share/vim/site/doc* -./usr/share/vim/vim74/doc* -./usr/share/zoneinfo* -./var/cache/man* -./var/lib/yum* -./tmp* -``` - -Generate the kernel and initrd for the netboot: - -```shell title="ssh root@xcat" -geninitrd rocky8.6-x86_64-netboot-compute -``` - -To pack the image as SquashFS, call: - -```shell title="ssh root@xcat" -packimage -m squashfs -c pigz rocky8.6-x86_64-netboot-compute -``` - -:::caution - -Even if no logs are shown, the process is running. You should wait until the end of the command. - -You must allocate enough `tmp` for the process to work. Inside the xCAT Helm `values`, you can use: - -```yaml -tmp: - medium: 'Memory' - size: 50Gi -``` - -If you wish to build inside the RAM. 
- -::: - -:::danger - -When using a diskless configuration, the image generated loses its linux capabilities. - -To determine which capabilities you need to restore, move to `/install/netboot/rocky8.6/x86_64/compute/rootimg` inside the xCAT container and run: - -```shell title="ssh root@xcat:/install/netboot/rocky8.6/x86_64/compute/rootimg" -{ - echo "#!/bin/bash" - echo "cd /" - find . |xargs getcap|awk -F= '{print "setcap" $2 " " $1}' -} > restorecap -chmod +x restorecap -mv restorecap /install/postscripts/restorecap -``` - -This command will create a `restorecap` script that you will need to add as postscript: - -```shell title="mystanzafile" -rocky8.6-x86_64-netboot-compute: - objtype=osimage - exlist=/install/rocky8.6/x86_64/Packages/compute.rocky8.x86_64.exlist - imagetype=linux - kernelver=4.18.0-305.17.1.el8_4.x86_64 - osarch=x86_64 - osname=Linux - osvers=rocky8.6 - permission=755 - postbootscripts=restorecap,git-configs-execute its-a-fake-password-dont-worry compute - profile=compute - provmethod=netboot - pkgdir=/tmp - pkglist=/dev/null - rootimgdir=/install/netboot/rocky8.6/x86_64/compute -``` - -```shell title="ssh root@xcat" -cat mystanzafile | mkdef -z -``` - -::: - -## Node configuration - -```shell title="cn1.stanza" -cn1: - objtype=node - addkcmdline=modprobe.blacklist=nouveau crashkernel=256M - arch=x86_64 - bmc=10.10.3.51 - bmcpassword=password - bmcusername=admin - cons=ipmi - consoleenabled=1 - currstate=netboot rocky8.6-x86_64-compute - groups=compute,all - ip=192.168.0.51 - mac=18:c0:4d:b7:88:5f - mgt=ipmi - netboot=xnba - os=rocky8.6 - profile=compute - provmethod=rocky8.6-x86_64-netboot-compute - serialport=1 - serialspeed=115200 -``` - -Edit [accordingly](https://xcat-docs.readthedocs.io/en/stable/guides/admin-guides/basic_concepts/xcat_object/node.html) and apply the stanza: - -```shell title="ssh root@xcat" -cat cn1.stanza | mkdef -z -``` - -Regenerate the DNS and DHCP configuration: - -```shell title="ssh root@xcat" -echo "reconfiguring hosts..." -makehosts -echo "reconfiguring dns..." -makedns -echo "reconfiguring dhcpd config..." -makedhcp -n -echo "reconfiguring dhcpd leases..." -makedhcp -a -``` - -And regenerate the PXE boot configuration: - -```shell title="ssh root@xcat" -nodeset osimage=rocky8.6-x86_64-netboot-compute -``` - -More details [here](https://xcat-docs.readthedocs.io/en/stable/guides/admin-guides/references/man7/node.7.html). 
-
-## Deploy
-
-```shell title="ssh root@xcat"
-rpower cn1 on # or rpower cn1 reset
-```
diff --git a/web/docs/guides/50-provisioning/03-gitops-with-grendel.mdx b/web/docs/guides/50-provisioning/03-gitops-with-grendel.mdx
new file mode 100644
index 000000000..d6f8e800f
--- /dev/null
+++ b/web/docs/guides/50-provisioning/03-gitops-with-grendel.mdx
@@ -0,0 +1,176 @@
+# GitOps with Grendel
+
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+## Postscript strategy
+
+If you've looked inside the Packer recipes, you can see that there is a systemd service that runs at boot to fetch the postscript from Grendel:
+
+```shell title="Extract of ks.bare.cfg"
+cat << 'END' >/pull-postscript.sh
+#!/bin/sh
+set -ex
+
+HOSTNAME="$(sed -E 's/^.*grendel.hostname=([^ ]*).*$/\1/' /proc/cmdline)"
+hostnamectl set-hostname "${HOSTNAME}"
+
+GRENDEL_ADDRESS="$(sed -E 's/^.*grendel.address=([^ ]*).*$/\1/' /proc/cmdline)"
+
+curl -fsSL ${GRENDEL_ADDRESS}/repo/postscript.sh -o /postscript.sh
+chmod +x /postscript.sh
+/postscript.sh ${HOSTNAME}
+END
+
+chmod +x /pull-postscript.sh
+
+cat <<'END' >/etc/systemd/system/grendel-postscript.service
+[Unit]
+Description=Grendel Postscript
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+ExecStart=/pull-postscript.sh
+
+[Install]
+WantedBy=multi-user.target
+END
+ln -s "/etc/systemd/system/grendel-postscript.service" "/etc/systemd/system/multi-user.target.wants/grendel-postscript.service"
+```
+
+The postscript is defined inside the Grendel configuration:
+
+```yaml title="helm/grendel/values-production.yaml"
+config:
+  postscript: ''
+```
+
+The strategy to enable GitOps is the following:
+
+1. The systemd service pulls the Grendel postscript.
+2. The Grendel postscript fetches the SSH private key from the Grendel HTTP server.
+3. The Grendel postscript runs `git clone` on a repository containing other postscripts, using the SSH private key.
+4. After cloning the repository, the postscripts are executed.
+
+## GitHub configuration
+
+### Your first postscript tracked with Git
+
+Create a **private** empty repository for your scripts and add a `post.sh` script.
+
+This script is the main entry point. If you want to add a hierarchy, you can use a script like this:
+
+```shell title="Example of postscript"
+#!/bin/sh
+
+# Find all the scripts and sort them by name
+scripts=$(find ./scripts -type f | sort)
+
+# Loop through each script and execute it
+for script in $scripts; do
+  # Check if the script needs to be chmod-ed
+  if [ ! -x "$script" ]; then
+    chmod +x "$script"
+  fi
+
+  # Execute the script
+  "./$script"
+done
+```
+
+This script will execute all the files inside the `scripts` folder in alphabetical order. So you need to create a `scripts` folder with scripts inside.
+
+**Commit and push everything.**
+
+### Adding a deploy key
+
+Generate a key pair using:
+
+```shell
+ssh-keygen -f $(pwd)/id_rsa -C grendel
+```
+
+[And add the `id_rsa.pub` as a deploy key.](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/managing-deploy-keys#set-up-deploy-keys)
+
+## Grendel configuration
+
+Let's add the private key to the Grendel HTTP server.
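+
+Before sealing the key into the cluster, you can optionally verify that GitHub accepts the deploy key. This is only a sanity-check sketch: `<your-org>/<your-postscripts-repo>` is a placeholder for your own postscripts repository.
+
+```shell title="user@local"
+# List the remote refs of the postscripts repository using only the freshly
+# generated deploy key. A successful listing means the key is usable.
+GIT_SSH_COMMAND='ssh -i ./id_rsa -o IdentitiesOnly=yes' \
+  git ls-remote git@github.com:<your-org>/<your-postscripts-repo>.git
+```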
+ +### Private key secret + +Create a secret: + +```yaml title="argo/provisioning/secrets/postscript-privatekey-secret.yaml.local" +apiVersion: v1 +kind: Secret +metadata: + name: postscript-privatekey-secret + namespace: provisioning +type: Opaque +stringData: + ## Create the key with: + ## ssh-keygen -f $(pwd)/key -C grendel + key: '' +``` + +Seal it and apply it: + +```yaml title="user@local:/ClusterFactory" +cfctl kubeseal +kubectl apply -f argo/provisioning/secrets/postscript-privatekey-sealed-secret.yaml +``` + +### Mounting the private key + +In the Grendel values file, add: + +```yaml title="helm/grendel/values-production.yaml" +## Extra volumes +volumes: + - name: postscript-privatekey + secret: + defaultMode: 384 + secretName: postscript-privatekey-secret + +## Extra volume mounts +volumeMounts: + - name: postscript-privatekey + subPath: key + mountPath: /var/lib/grendel/key +``` + +This will mount the key inside the HTTP server. + +### Setup the Grendel postscript for GitOps + +In the Grendel values file, change the `postscript` field to: + +```yaml title="helm/grendel/values-production.yaml" +config: + postscript: | + #!/bin/sh + + set -ex + + # Fetch deploy key + curl --retry 5 -fsSL http://grendel.internal/repo/key -o /key + chmod 600 /key + + # Cloning git repo containing postscripts. + mkdir -p /configs + GIT_SSH_COMMAND='ssh -i /key -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null' git clone git@github.com:/.git /configs + if [ -f /configs/post.sh ] && [ -x /configs/post.sh ]; then + cd /configs || exit 1 + ./post.sh "$1" + fi + rm -f /key + + # Security + chmod -R g-rwx,o-rwx . +``` + +## Conclusion + +And that's it! With this, the node postscripts will be tracked on Git and you won't be lost in your node configuration. diff --git a/web/docs/guides/50-provisioning/04-gitops-with-xcat.mdx b/web/docs/guides/50-provisioning/04-gitops-with-xcat.mdx deleted file mode 100644 index 17f5e6c39..000000000 --- a/web/docs/guides/50-provisioning/04-gitops-with-xcat.mdx +++ /dev/null @@ -1,275 +0,0 @@ -# GitOps with xCAT - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - -:::info - -In the next version of ClusterFactory, xCAT will be a Kubernetes operator. - -This means that the stanza file for the definition of the cluster can be written in YAML, and there will be no need to SSH to xCAT. - -::: - -## Stanza files as source of truth - -You can dump the database by using: - -```shell title="root@xcat" -lsdef -a -l -z > mydbstanzafile -# -a: all -# -l: long -# -z: stanza -``` - -You can apply a stanza by using: - -```shell title="root@xcat" -cat mydbstanzafile | mkdef -z -``` - -## Creating and using a postbootscript/cloud-init file to allow GitOps - -A post-boot script/cloud-init file is executed after the boot of the OS, thanks to SystemD. - -The strategy is the following: - -1. The postscripts/cloud-init file will accept one parameter: a private key. This key shouldn't be checked in Git. -2. The secret key will decrypt the SSH deploy key, which is stored inside the script. -3. With the SSH deploy key, the script will `git clone` the Git repository containing the configuration files -4. If the script `post.sh` exists in the repository, then we execute this file. -5. This file will copy files and executes other post-boot scripts. - -### Step 1: Setup the Git repository - -```shell title="user@local:/" -mkdir compute-configs -cd compute-configs -git init -``` - -Create a `post.sh` file. This is the entry point. 
You can do anything you want inside (copy or run other scripts). - -Here is an example of our script that we use daily for [DeepSquare](https://deepsquare.io): - -```shell title="user@local" -#!/bin/sh - -# RUN wraps the command to log into journalctl -RUN() { - logger -t postscripts "Running $*..." - "$@" - code=$? - if [ $code -eq 1 ]; then - logger -t postscripts "$* failed with error code $code" - elif [ $code -ne 0 ]; then - logger -t postscripts "$* exited with error code $code" - fi - logger -t postscripts "$* exited with code $code" -} - -COPY() { - mkdir -p "$(dirname "$2")" - rsync -av "$1" "$2" -} - -SCRIPTPATH=$(dirname "$(realpath "$0")") - -# -- SYNCLIST -cd "${SCRIPTPATH}/files" || (echo "cd failed" && exit 1) -COPY ./sssd/sssd.rocky.conf /etc/sssd/sssd.conf - -COPY ./munge/munge.key /etc/munge/munge.key - -# Slurm configless -COPY ./slurm/slurmd_defaults /etc/default/slurmd - -# Slurm -COPY ./systemd/slurmd.service /etc/systemd/system/slurmd.service -COPY ./enroot/00-default.conf /etc/enroot/enroot.conf.d/00-default.conf -COPY ./slurm/prolog.d/ /etc/slurm/prolog.d/ -COPY ./slurm/epilog.d/ /etc/slurm/epilog.d/ -COPY ./slurm/plugstack.rocky.conf.d/ /etc/slurm/plugstack.conf.d/ - -# CA -COPY ./certs/csquare.gcloud.pem /etc/pki/ca-trust/source/anchors/csquare.gcloud.pem -update-ca-trust -systemctl restart sssd - -# -- APPEND -cat ./slurmctl/keys/id_rsa.pub >>/root/.ssh/authorized_keys - -# Restore context -cd "${SCRIPTPATH}" || (echo "cd failed" && exit 1) - -# -- EXECUTE (use RUN to log your postscripts) -PATH="${SCRIPTPATH}/postscripts:$PATH" - -RUN ldap -RUN fs_mount -RUN slurm -RUN set-motd -``` - -The copied files are stored inside a `files/` directory and other scripts are stored inside a `postscripts/` directory. - -Like this: - -```shell -. -├── files -│   ├── certs -│   │   └── csquare.gcloud.pem -│   ├── enroot -│   │   └── 00-default.conf -│   ├── munge -│   │   └── munge.key -│   ├── slurm -│   │   ├── epilog.d -│   │   │   └── none.sh -│   │   ├── plugstack.rocky.conf.d -│   │   │   └── spank.conf -│   │   ├── prolog.d -│   │   │   └── 50-all-enroot-dirs -│   │   └── slurmd_defaults -│   ├── slurmctl -│   │   └── keys -│   │   ├── id_rsa -│   │   └── id_rsa.pub -│   ├── sssd -│   │   └── sssd.rocky.conf -│   └── systemd -│   └── slurmd.service -├── git-configs-execute.xcat-postbootscript.example -├── postscripts -│   ├── fs_mount -│   ├── ldap -│   ├── set-motd -│   └── slurm -└── post.sh -``` - -Commit and put it on GitHub as a private (or public if you feel safe) repository: - -```title="user@local:/compute-configs" -git add . -git commit -m "feat: initial commit" -git remote add origin https://github.com/user/repo.git -git branch -M main -git push -u origin main -``` - -### Step 2: Add a SSH deploy key to the GitHub repository - -```shell title="user@local" -ssh-keygen -t ed25519 -f key -``` - -Put the `key.pub` on [GitHub as a deploy key](https://docs.github.com/en/developers/overview/managing-deploy-keys#setup-2): - -![Deploy Key page](04-gitops-with-xcat.assets/deploy-key.png) - -### Step 3: Encrypt the SSH deploy private key - -```shell title="user@local" -openssl enc -aes-256-cbc -a -salt -pbkdf2 -in key -out key.enc -``` - -Save the password for the next step. 
- -### Step 4: Creating the post-boot script/cloud-init file - - - - -```shell title="git-config-execute.sh " -#!/bin/sh -# Params: -# 1: password for the ssh key - -set -x - -mkdir -p /configs - -# Encrypt -cat << EOF > /key.enc - -EOF -chmod 600 /key.enc -echo "$1" | openssl aes-256-cbc -d -a -pbkdf2 -in /key.enc -out /key -pass stdin -chmod 600 /key -GIT_SSH_COMMAND='ssh -i /key -o IdentitiesOnly=yes' git clone /configs -if [ -f /configs/post.sh ] && [ -x /configs/post.sh ]; then - cd /configs || exit 1 - ./post.sh -fi -rm -f /key /key.env - -# Security -chmod -R g-rwx,o-rwx . -``` - -On xCAT, you should add the post-boot script inside an `osimage` stanza: - -```shell title="stanzafile" -rocky8.6-x86_64-netboot-compute: - objtype=osimage - exlist=/xcatdata/install/rocky8.6/x86_64/Packages/compute.rocky8.x86_64.exlist - imagetype=linux - kernelver=4.18.0-305.17.1.el8_4.x86_64 - osarch=x86_64 - osname=Linux - osvers=rocky8.6 - permission=755 - postbootscripts=git-config-execute.sh - profile=compute - provmethod=netboot - pkgdir=/tmp - pkglist=/dev/null - rootimgdir=/install/netboot/rocky8.6/x86_64/compute -``` - -:::note - -Since we are doing GitOps, we do not need to use the xCAT provisioning system. Therefore, we set `pkgdir=/tmp` and `pkglist=/dev/null`. - -::: - -Since the stanza contains a secret, you should store it in a Secret management system like HashiCorp Vault or a Sealed Secrets. - - - - -```yaml -#cloud-config -write_files: - - content: | - - path: /key.enc - permissions: '0600' - -runcmd: - - [ - sh, - -c, - "echo '' | openssl aes-256-cbc -d -a -pbkdf2 -in /key.enc -out /key -pass stdin", - ] - - [chmod, '600', /key] - - [ - sh, - -c, - "mkdir -p /configs && GIT_SSH_COMMAND='ssh -i /key -o IdentitiesOnly=yes' git clone /configs", - ] - - [ - sh, - -c, - 'if [ -f /configs/post.sh ] && [ -x /configs/post.sh ]; then cd /configs && ./post.sh compute; fi', - ] - - [rm, -f, /key, /key.enc] - - [chmod, -R, 'g-rwx,o-rwx', '.'] -``` - -Since the `cloud-init` contains a secret, you should store it in a Secret management system like HashiCorp Vault or a Sealed Secrets. - - - diff --git a/web/docs/guides/50-provisioning/_category_.json b/web/docs/guides/50-provisioning/_category_.json index 543aaa17b..fa81acadd 100644 --- a/web/docs/guides/50-provisioning/_category_.json +++ b/web/docs/guides/50-provisioning/_category_.json @@ -1,3 +1,3 @@ { - "label": "Provisioning Stack" + "label": "Provisioning" } diff --git a/web/docs/guides/60-slurm/01-deploy-slurm.mdx b/web/docs/guides/60-slurm/01-deploy-slurm.mdx index 287e38129..27eb1822c 100644 --- a/web/docs/guides/60-slurm/01-deploy-slurm.mdx +++ b/web/docs/guides/60-slurm/01-deploy-slurm.mdx @@ -515,102 +515,16 @@ The service should be accessible at the address `slurm-cluster--db ## 4. Slurm Controller Deployment -### 4.a. Volumes - -We will use NFS. Feel free to use another type of storage. 
- - - - -```yaml title="argo/slurm-cluster/volumes/controller-state--nfs.yaml" -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: controller-state--nfs - namespace: slurm-cluster - labels: - app: slurm-controller - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -provisioner: nfs.csi.k8s.io -parameters: - server: # IP or host - share: # /srv/nfs/k8s/slurmctl - mountPermissions: '0775' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/region - values: - - # - -``` - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/slurm-cluster/volumes/controller-state--nfs.yaml -``` - - - - -```yaml title="argo/slurm-cluster/volumes/controller-state--pv.yaml" -apiVersion: v1 -kind: PersistentVolume -metadata: - name: controller-state--pv - namespace: slurm-cluster - labels: - app: slurm-controller - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -spec: - capacity: - storage: 10Gi - mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime - csi: - driver: nfs.csi.k8s.io - readOnly: false - volumeHandle: # uuidgen - volumeAttributes: - server: # IP or host - share: # /srv/nfs/k8s/slurmctl - mountPermissions: '0775' - accessModes: - - ReadWriteOnce - persistentVolumeReclaimPolicy: Retain -``` - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/slurm-cluster/volumes/controller-state--pv.yaml -``` - -The label `app=slurm-controller` will be used by the PersistentVolumeClaim. - - - - -### 4.b. Values: Enable SLURM Controller +### 4.a. Values: Enable SLURM Controller Let's add the values to deploy a SLURM Controller. - - - ```yaml title="helm/slurm-cluster/values-.yaml" controller: enabled: true persistence: - storageClassName: 'controller-state--nfs' + storageClassName: 'dynamic-nfs' accessModes: ['ReadWriteOnce'] size: 10Gi @@ -627,39 +541,6 @@ controller: memory: '1Gi' ``` - - - -```yaml title="helm/slurm-cluster/values-.yaml" -controller: - enabled: true - - persistence: - storageClassName: '' - accessModes: ['ReadWriteOnce'] - size: 10Gi - selectorLabels: - app: slurm-controller - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- - - nodeSelector: - kubernetes.io/hostname: - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- - - resources: - requests: - cpu: '250m' - memory: '1Gi' - limits: - cpu: - memory: '1Gi' -``` - - - - Notice that `kubernetes.io/hostname` is used, this is because the slurm controller will be using the host network and we don't want to make the slurm controller move around. We might develop a HA setup in the future version of ClusterFactory. @@ -724,280 +605,28 @@ We have enabled `config-less` in the `slurm.conf`. We need to build an OS Image with Slurm Daemon installed. 
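+
+These packages come from the DeepSquare YUM repository, so the repository has to be reachable during the image build. A minimal sketch of enabling it, assuming you register it from the `%post` section of the Packer kickstart recipe (adapt the section and package set to your own recipe; the full package list follows below):
+
+```shell title="Extract of a kickstart %post section (sketch)"
+# Assumption: dnf-plugins-core is installed so that `dnf config-manager` is available.
+dnf config-manager --add-repo https://yum.deepsquare.run/yum.repo
+
+# Install the Slurm daemon; add the remaining packages listed below in the same way.
+dnf install -y slurm slurm-slurmd
+```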
-Using the `packer-recipes` directory, we can create a recipe called `compute.my-cluster.json`: - -```json title="packer-recipes/rocky/compute.my-cluster.json" -{ - "variables": { - "boot_wait": "3s", - "disk_size": "50G", - "iso_checksum": "53a62a72881b931bdad6b13bcece7c3a2d4ca9c4a2f1e1a8029d081dd25ea61f", - "iso_url": "https://download.rockylinux.org/vault/rocky/8.4/isos/x86_64/Rocky-8.4-x86_64-boot.iso", - "memsize": "8192", - "numvcpus": "8" - }, - "builders": [ - { - "type": "qemu", - "accelerator": "kvm", - "communicator": "none", - "boot_command": [ - " ", - "inst.ks=http://{{ .HTTPIP }}:{{ .HTTPPort }}/ks.my-cluster.cfg ", - "inst.cmdline", - "" - ], - "boot_wait": "{{ user `boot_wait` }}", - "disk_size": "{{ user `disk_size` }}", - "iso_url": "{{ user `iso_url` }}", - "iso_checksum": "{{ user `iso_checksum` }}", - "headless": true, - "cpus": "{{ user `numvcpus` }}", - "memory": "{{ user `memsize` }}", - "vnc_bind_address": "0.0.0.0", - "shutdown_timeout": "3h", - "shutdown_timeout": "1h", - "qemuargs": [["-serial", "stdio"]] - } - ] -} -``` - -Create also the `ks.my-cluster.cfg` in the `http` directory: - -```shell title="packer-recipes/rocky/http/ks.my-cluster.cfg" -url --url="https://dl.rockylinux.org/pub/rocky/9.0/BaseOS/x86_64/os/" -# License agreement -eula --agreed -# Disable Initial Setup on first boot -firstboot --disable -# Poweroff after the install is finished -poweroff -# Firewall -firewall --disable -# Disable Initial Setup on first boot -firstboot --disable -ignoredisk --only-use=vda -# Use SSSD -authselect select sssd with-mkhomedir with-sudo -# System language -lang en_US.UTF-8 -# Keyboard layout -keyboard us -# Network information -network --bootproto=dhcp --device=eth0 -# SELinux configuration -selinux --disabled -# System timezone -timezone UTC --utc -# System bootloader configuration -bootloader --location=mbr --driveorder="vda" --timeout=1 -# Root password -rootpw --plaintext an_example_of_default_password -# System services -services --enabled="chronyd" - -repo --name="AppStream" --baseurl=https://dl.rockylinux.org/pub/rocky/9.0/AppStream/x86_64/os/ -repo --name="Extras" --baseurl=https://dl.rockylinux.org/pub/rocky/9.0/extras/x86_64/os/ -repo --name="CRB" --baseurl=https://dl.rockylinux.org/pub/rocky/9.0/CRB/x86_64/os/ -repo --name="epel" --baseurl=https://mirror.init7.net/fedora/epel/9/Everything/x86_64/ -repo --name="deepsquare" --baseurl=https://yum.deepsquare.run/9/ - -# Clear the Master Boot Record -zerombr -# Remove partitions -clearpart --all --initlabel -# Automatically create partition -part / --size=1 --grow --asprimary --fstype=xfs - -# Postinstall -%post --erroronfail -set -ex -mkdir /opt/xcat - -# Install xCat provisioning service -curl -fsSL "https://raw.githubusercontent.com/xcat2/xcat-core/master/xCAT/postscripts/xcatpostinit1.netboot" -o /opt/xcat/xcatpostinit1 -chmod 755 /opt/xcat/xcatpostinit1 - -curl -fsSL "https://raw.githubusercontent.com/xcat2/xcat-core/master/xCAT/postscripts/xcatpostinit1.service" -o /etc/systemd/system/xcatpostinit1.service -ln -s "../xcatpostinit1.service" "/etc/systemd/system/multi-user.target.wants/xcatpostinit1.service" - -# Postinstall - -#-- Pam mkhomedir: auto create home folder for ldap users -sed -Ei 's|UMASK\t+[0-9]+|UMASK\t\t027|g' /etc/login.defs - -#-- Secure umask for newly users -echo 'umask 0027' >> /etc/profile - -# Kickstart copies install boot options. Serial is turned on for logging with -# Packer which disables console output. 
Disable it so console output is shown -# during deployments -sed -i 's/^GRUB_TERMINAL=.*/GRUB_TERMINAL_OUTPUT="console"/g' /etc/default/grub -sed -i '/GRUB_SERIAL_COMMAND="serial"/d' /etc/default/grub -sed -ri 's/(GRUB_CMDLINE_LINUX=".*)\s+console=ttyS0(.*")/\1\2/' /etc/default/grub - -# Clean up install config not applicable to deployed environments. -for f in resolv.conf fstab; do - rm -f /etc/$f - touch /etc/$f - chown root:root /etc/$f - chmod 644 /etc/$f -done +Install these packages: -cat << EOF >>/etc/fstab -devpts /dev/pts devpts gid=5,mode=620 0 0 -tmpfs /dev/shm tmpfs defaults 0 0 -proc /proc proc defaults 0 0 -sysfs /sys sysfs defaults 0 0 -EOF - -rm -f /etc/sysconfig/network-scripts/ifcfg-[^lo]* - -dnf clean all -%end - -%packages -@minimal-environment -chrony - -# kernel -kernel-5.14.0-70.22.1.el9_0.x86_64 -kernel-devel-5.14.0-70.22.1.el9_0.x86_64 -kernel-headers-5.14.0-70.22.1.el9_0.x86_64 -kernel-tools-5.14.0-70.22.1.el9_0.x86_64 -kernel-modules-5.14.0-70.22.1.el9_0.x86_64 -kernel-core-5.14.0-70.22.1.el9_0.x86_64 -kernel-modules-extra-5.14.0-70.22.1.el9_0.x86_64 - -bash-completion -cloud-init -# cloud-init only requires python3-oauthlib with MAAS. As such upstream -# removed this dependency. -python3-oauthlib -rsync -tar - -# disk growing -cloud-utils-growpart - -# grub2-efi-x64 ships grub signed for UEFI secure boot. If grub2-efi-x64-modules -# is installed grub will be generated on deployment and unsigned which breaks -# UEFI secure boot. -grub2-efi-x64 -efibootmgr -shim-x64 -dosfstools -lvm2 -mdadm -device-mapper-multipath -iscsi-initiator-utils - -dnf-plugins-core - -# other packages -net-tools -nfs-utils -openssh-server -rsync -tar -util-linux -wget -python3 -tar -bzip2 -bc -dracut -dracut-network -rsyslog -hostname -e2fsprogs -ethtool -parted -openssl -dhclient -openssh-clients -bash -vim-minimal -rpm -iputils -perl-interpreter -gawk -xz -squashfs-tools -cpio -sudo -make -bash-completion -nano -pciutils -git -mlocate -sssd -vim-enhanced -systemd-udev -numactl -munge -libevent-devel -tmux -oddjob -oddjob-mkhomedir -redis -unzip -nmap -flex -tk -bison -libgfortran -tcl -gcc-gfortran -libcurl -libnl3-devel -python39 -numactl-libs -xfsprogs -zsh -#pkgconf-pkg-config -rpm-build -hwloc -hwloc-libs -hwloc-devel -tcsh -ksh -xorg-x11-fonts-ISO8859-1-75dpi.noarch -xorg-x11-fonts-cyrillic.noarch - -# otherpkgs -htop +``` pmix4 slurm slurm-contribs slurm-libpmi slurm-pam_slurm slurm-slurmd -# beeond build dependency -elfutils-libelf-devel - --plymouth -# Remove Intel wireless firmware --i*-firmware -%end +libnvidia-container1 +libnvidia-container-tools +enroot-hardened +enroot-hardened+caps +nvslurm-plugin-pyxis ``` -Build the image with: +from the [DeepSquare YUM repository](https://yum.deepsquare.run/yum.repo). -```shell title="user@local:/ClusterFactory/packer-recipes/rocky" -packer build compute.my-cluster.json -``` +### 5.b. Postscripts -And send the os image to xcat. [Follow the guide "Build an OS Image with Packer" for more details](/docs/guides/provisioning/packer-build). - -### 5.b. xCAT Postbootscripts - -Next, you have to configure a service by using a xCAT postscript. Our recommendation is to use a xCAT postscript to pull a Git repository which, based on the content of the repository, copies the files and executes the postscripts in that Git repository. - -This way, the GitOps practice is always followed and permits to adapt for the future version of ClusterFactory. +Next, you have to configure a service by using a postscript. 
The service: @@ -1011,7 +640,7 @@ Type=forking ExecStartPre=/usr/bin/id slurm Restart=always RestartSec=3 -ExecStart=/usr/sbin/slurmd -d /usr/sbin/slurmstepd --conf-server +ExecStart=/usr/sbin/slurmd -d /usr/sbin/slurmstepd --conf-server slurm-cluster--controller-0.example.com ExecReload=/bin/kill -HUP $MAINPID PIDFile=/var/run/slurmd.pid KillMode=process @@ -1023,10 +652,23 @@ LimitSTACK=infinity WantedBy=multi-user.target ``` +:::warning + +Add `slurm-cluster--controller-0.example.com` to the CoreDNS configuration. + +If you are using a hostPort, put the IP of the Kubernetes host hosting the pod. +If you are using a LoadBalancer, put the IP you've given to the LoadBalancer. +If you are using IPVLAN, put the IP you've given to the IPVLAN. + +::: + A simple postbootscript: ```shell title="sample-configure-slurm.sh #!/bin/sh -ex +# Copy the CA certificate created for the private-cluster-issuer +cp ./certs/my-ca.prem /etc/pki/ca-trust/source/anchors/my-ca.pem + mkdir -p /var/log/slurm/ cat <<\END | base64 -d >/etc/munge/munge.key @@ -1095,6 +737,25 @@ After setup SLURM, you should also: - Use the postscript to configure SSSD - Use the postscript to import the `munge.key` +:::warning Troubleshooting + +In the order: + +1. Check your journal (`journalctl`) and check the logs. +2. Stuck in a `id slurm` loop ? + - Check the SSSD configuration + - Check the TLS certificate + - Check if Traefik and LDAP ports (`nc -vz `) +3. SSSD is working (`id slurm` shows 1501 as UID), but `sinfo` is crashing + - Check the health of your SLURM controller pod + - Check if the ports of the SLURM controller (`nc -vz 6817`) + - Check if the domain name of the SLURM controller can be received (`dig @ slurm-cluster--controller-0.example.com`) + - Check the DNS client configuration (`/etc/resolv.conf`) +4. Slurm is crashing but not `sinfo` + - Check `/var/log/slurm/slurm.log` + +::: + ### 5.c. Reboot the nodes If the controller is running, the nodes should automatically receive the `slurm.conf` inside `/run/slurm/conf`. @@ -1315,11 +976,11 @@ A Kubernetes Service offers a lot of advantages while IPVLan offers a solution t It is **extremely** recommended to use a Kubernetes service to expose your connection node as it provides load balancing and is easy to configure. -| Kubernetes LoadBalancer Service | Multus CNI | -| ------------------------------------------------------------ | ------------------------------------------------------------ | +| Kubernetes LoadBalancer Service | Multus CNI | +| ---------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | A LoadBalancer service provides limited control over networking, as it only provides a single IP address for a Kubernetes service. | IPVLan with Multus allows you to have more fine-grained control over networking by enabling you to use multiple network interfaces in a pod, each with its own IP address and route table. | -| A LoadBalancer service is a simple and straightforward way to expose a Kubernetes service to the internet. | Setting up IPVLan with Multus can be more complex than using a simple LoadBalancer service, as it requires more configuration and setup time. | -| A LoadBalancer service can only expose a set of ports. | Using IPVLan with Multus will allow a pod to directly connect to the host network. 
| +| A LoadBalancer service is a simple and straightforward way to expose a Kubernetes service to the internet. | Setting up IPVLan with Multus can be more complex than using a simple LoadBalancer service, as it requires more configuration and setup time. | +| A LoadBalancer service can only expose a set of ports. | Using IPVLan with Multus will allow a pod to directly connect to the host network. | **As a result, using a Kubernetes LoadBalancer service will render the Slurm `srun` commands inoperable (although `sbatch` will work and is the preferred method for job submission). On the other hand, adopting Multus CNI eliminates the load balancing feature, but could lead to instability.** diff --git a/web/docs/guides/60-slurm/03-deploy-ondemand.md b/web/docs/guides/60-slurm/03-deploy-ondemand.md index fcb7329d4..bebe48b87 100644 --- a/web/docs/guides/60-slurm/03-deploy-ondemand.md +++ b/web/docs/guides/60-slurm/03-deploy-ondemand.md @@ -31,89 +31,6 @@ Deploying Open OnDemand is very similar to [deploying the Slurm Login node](/doc This is the same as the deployment of [the Slurm Login Nodes](/docs/guides/slurm/deploy-slurm#ssh-server-configuration) but with extra steps. -### Dex storage - -We also need to store the state of Dex. - -We will use NFS. Feel free to use another type of storage. - - - - -```yaml title="argo/slurm-cluster/volumes/dex-state-nfs.yaml" -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: dex-state-nfs - namespace: slurm-cluster - labels: - app: open-ondemand - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -provisioner: nfs.csi.k8s.io -parameters: - server: # IP or host - share: # /srv/nfs/k8s/dex - mountPermissions: '0700' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/region - values: - - # - -``` - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/slurm-cluster/volumes/dex-state-nfs.yaml -``` - - - - -```yaml title="argo/slurm-cluster/volumes/dex-state-pv.yaml" -apiVersion: v1 -kind: PersistentVolume -metadata: - name: dex-state-pv - namespace: slurm-cluster - labels: - app: open-ondemand - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -spec: - capacity: - storage: 10Gi - mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime - csi: - driver: nfs.csi.k8s.io - readOnly: false - volumeHandle: # uuidgen - volumeAttributes: - server: # IP or host - share: # /srv/nfs/k8s/dex - mountPermissions: '0700' - accessModes: - - ReadWriteOnce - persistentVolumeReclaimPolicy: Retain -``` - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/slurm-cluster/volumes/dex-state-pv.yaml -``` - - - - ### Open OnDemand secret configuration The configuration of Open OnDemand must be stored in a secret because it could leak the LDAP password: diff --git a/web/docs/guides/70-cvmfs/04-deploy-cvmfs.md b/web/docs/guides/70-cvmfs/04-deploy-cvmfs.md index 0a28839f7..fc5f9c1db 100644 --- a/web/docs/guides/70-cvmfs/04-deploy-cvmfs.md +++ b/web/docs/guides/70-cvmfs/04-deploy-cvmfs.md @@ -4,6 +4,18 @@ Let's assume we plan to replicate `http://cvmfs.example.com/cvmfs/repo.example.com`. +:::warning + +There is an issue with the Cgroups V2, a feature in recent linux kernels. + +The issue happens when a container image uses SystemD as the init system. + +When using a container image with SystemD, `/sys/fs/cgroup` must be mounted on the container. 
However, with Cgroups v2, the structure of this directory changed. + +Therefore, you MUST rollback to Cgroups v1 until SystemD can run with Cgroups v2. To rollback, add `systemd.unified_cgroup_hierarchy=0` to the kernel cmdline parameter. + +::: + ## Helm and Docker resources The Helm resources are stored on [ClusterFactory Git Repository](https://github.com/SquareFactory/ClusterFactory/tree/main/helm/cvmfs-server). @@ -65,7 +77,7 @@ Basically, `local-path-provisioner` creates the `/opt/local-path-provisioner` di To deploy the provisioner: ```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/default/apps/local-path-provisioner-app.yaml +kubectl apply -f argo/local-path-storage/apps/local-path-storage-app.yaml ``` The `StorageClass` `local-path` should be deployed. diff --git a/web/docs/guides/800-deploy-ldap.md b/web/docs/guides/800-deploy-ldap.md index 67646379f..83407adfe 100644 --- a/web/docs/guides/800-deploy-ldap.md +++ b/web/docs/guides/800-deploy-ldap.md @@ -21,91 +21,9 @@ docker pull docker.io/389ds/dirsrv:latest kubectl apply -f argo/ldap/ ``` -## 2. Persistent Volumes, Secrets and Ingresses +## 2. Secrets and Ingresses -### 2.a. Creating a `StorageClass` or `PersistentVolume` - -We will use NFS. Feel free to use another type of storage. We recommend at least 100 GB since the storage is used to store the root file system of the operating system images. - - - - -```yaml title="argo/ldap/volumes/storage-class.yaml" -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: 389ds-nfs - namespace: ldap - labels: - app: 389ds - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -provisioner: nfs.csi.k8s.io -parameters: - server: # IP or host - share: # /srv/nfs/k8s/389ds - mountPermissions: '0775' -mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime -volumeBindingMode: Immediate -reclaimPolicy: Retain -allowedTopologies: - - matchLabelExpressions: - - key: topology.kubernetes.io/zone - values: - - # -- -``` - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/ldap/volumes/storage-class.yaml -``` - - - - -```yaml title="argo/ldap/volumes/persistent-volume.yaml" -apiVersion: v1 -kind: PersistentVolume -metadata: - name: 389ds-pv - namespace: ldap - labels: - app: 389ds - topology.kubernetes.io/region: # - - topology.kubernetes.io/zone: # -- -spec: - capacity: - storage: 100Gi - mountOptions: - - hard - - nfsvers=4.1 - - noatime - - nodiratime - csi: - driver: nfs.csi.k8s.io - readOnly: false - volumeHandle: # uuidgen - volumeAttributes: - server: # IP or host - share: # /srv/nfs/k8s/389ds - accessModes: - - ReadWriteOnce - persistentVolumeReclaimPolicy: Retain -``` - -```shell title="user@local:/ClusterFactory" -kubectl apply -f argo/ldap/volumes/persistent-volume.yaml -``` - -The label `app=ldap` will be used by the PersistentVolumeClaim. - - - - -### 2.b. Editing the environment variables with secrets +### 2.a. Editing the environment variables with secrets Take a look at the README of [389ds/dirsrv](https://hub.docker.com/r/389ds/dirsrv). @@ -141,7 +59,7 @@ kubectl apply -f argo/ldap/secrets/389ds-sealed-secret.yaml You can track `389ds-env-sealed-secret.yaml` in Git, but not the `-secret.yaml.local` file. -### 2.c. Creating an `IngressRouteTCP` to expose the LDAP server +### 2.b. Creating an `IngressRouteTCP` to expose the LDAP server You can expose the LDAP using Traefik `IngressRouteTCP`. @@ -314,28 +232,12 @@ Edit the `suffixName` according to your need. This is the path in LDAP where the ### 4.c. 
Mount the volume - - - -```yaml title="helm/389ds/values-production.yaml" -# ... -persistence: - storageClassName: '389ds-nfs' -``` - - - - ```yaml title="helm/389ds/values-production.yaml" # ... persistence: - selectorLabels: - app: 389ds + storageClassName: 'dynamic-nfs' ``` - - - ## 4. Deploy the app Commit and push: diff --git a/web/docs/guides/999-develop-apps.md b/web/docs/guides/999-develop-apps.md deleted file mode 100644 index 76f32dd64..000000000 --- a/web/docs/guides/999-develop-apps.md +++ /dev/null @@ -1,410 +0,0 @@ -# Develop Applications to integrate into ClusterFactory - -This guide covers the general process to develop an application for Cluster -Factory and might help you to integrate your applications. - -Let's take the example of xCAT, which is a complex bare-metal provisioning solution. - -## 1. Dockerize/Containerize the application - -All applications are containerizable. You will need to fetch a lot of -information to see how difficult it is. - -You should check for: - -- Dependencies: - - The base image (Ex: Rocky Linux) - - Build-time setup (enabling the services) - - Runtime dependencies (Ex: Perl, Apache, xCAT, ...) - - Runtime setup (the entry-point script) - - Init system (SystemD) - - And eventually, host dependencies -- Interfaces: - - Network: - - All the TCP and UDP ports (Ex: DHCP, SSH, ...) - - Host network (Ex: The DHCP server needs the host network to receive broadcast DHCP requests.) - - Volumes: - - Persistent Volumes (Ex: The xCAT databases.) - - Is it possible to set a read-only filesystem? -- Privileges - - Is it possible to run rootless? - - Is there any capabilities? (Ex: `NET_BIND_SERVICE`, ...) - -Knowing these details will make it easier to write a Dockerfile and test it. - -The xCAT Dockerfile: - -```dockerfile -FROM quay.io/rockylinux/rockylinux:8.4 - -LABEL MAINTAINER Square Factory - -ENV container docker - -ARG xcat_version=latest -ARG xcat_reporoot=https://xcat.org/files/xcat/repos/yum -ARG xcat_baseos=rh8 - -# Remove useless SystemD services -RUN (cd /lib/systemd/system/sysinit.target.wants/; \ - for i in *; do [ $i == systemd-tmpfiles-setup.service ] || rm -f $i; done); \ - rm -f /lib/systemd/system/multi-user.target.wants/* \ - && rm -f /etc/systemd/system/*.wants/* \ - && rm -f /lib/systemd/system/local-fs.target.wants/* \ - && rm -f /lib/systemd/system/sockets.target.wants/*udev* \ - && rm -f /lib/systemd/system/sockets.target.wants/*initctl* \ - && rm -f /lib/systemd/system/basic.target.wants:/* \ - && rm -f /lib/systemd/system/anaconda.target.wants/* - -# Setup symlink -RUN mkdir -p /xcatdata/etc/{dhcp,goconserver,xcat} && ln -sf -t /etc /xcatdata/etc/{dhcp,goconserver,xcat} && \ - mkdir -p /xcatdata/{install,tftpboot} && ln -sf -t / /xcatdata/{install,tftpboot} - -# Install dependencies -RUN dnf install -y -q wget which \ - && wget ${xcat_reporoot}/${xcat_version}/$([[ "devel" = "${xcat_version}" ]] && echo 'core-snap' || echo 'xcat-core')/xcat-core.repo -O /etc/yum.repos.d/xcat-core.repo \ - && wget ${xcat_reporoot}/${xcat_version}/xcat-dep/${xcat_baseos}/$(uname -m)/xcat-dep.repo -O /etc/yum.repos.d/xcat-dep.repo \ - && dnf install -y \ - xCAT \ - openssh-server \ - rsyslog \ - createrepo \ - chrony \ - initscripts \ - man \ - nano \ - pigz \ - bash-completion \ - vim \ - epel-release \ - && dnf install -y \ - screen \ - bind-utils \ - && dnf clean all - -# Setup SSH -RUN sed -i -e 's|#PermitRootLogin yes|PermitRootLogin yes|g' \ - -e 's|#Port 22|Port 2200|g' \ - -e 's|#UseDNS yes|UseDNS no|g' /etc/ssh/sshd_config \ - && echo 
"StrictHostKeyChecking no" >> /etc/ssh/ssh_config \ - && echo "root:cluster" | chpasswd \ - && rm -rf /root/.ssh \ - && mv /xcatdata /xcatdata.NEEDINIT - -# Enable services -RUN systemctl enable httpd \ - && systemctl enable sshd \ - && systemctl enable dhcpd \ - && systemctl enable rsyslog \ - && systemctl enable xcatd - -# Copy our edited genimage -COPY ./opt/xcat/share/xcat/netboot/rh/genimage /opt/xcat/share/xcat/netboot/rh/genimage - -COPY entrypoint.sh /entrypoint.sh -RUN chmod +x /entrypoint.sh - -COPY startup.sh /startup.sh -RUN chmod +x /startup.sh - -ENV XCATROOT /opt/xcat -ENV PATH="$XCATROOT/bin:$XCATROOT/sbin:$XCATROOT/share/xcat/tools:$PATH" MANPATH="$XCATROOT/share/man:$MANPATH" -VOLUME [ "/xcatdata", "/var/log/xcat" ] - -EXPOSE 3001/tcp 3001/udp \ - 3002/tcp 3002/udp \ - 7/udp \ - 873/tcp 873/udp \ - 53/tcp 53/udp \ - 67/tcp 67/udp \ - 68/tcp 68/udp \ - 69/tcp 69/udp \ - 111/udp \ - 514/tcp 514/udp \ - 4011/tcp \ - 623/tcp 623/udp \ - 2200/udp - -CMD [ "/startup.sh" ] -``` - -The `EXPOSE` declares which ports must be open for xCAT to be fully functional. - -The `VOLUME` declares which volumes need to be persistent. - -Other volumes can be mounted as read-only configurations. For example, since -we are running `systemd`, we need to mount the `/sys/fs/cgroup` directory. - -The entry point: - -```shell title="statup.sh" -#!/bin/bash - -setsid ./entrypoint.sh & - -exec /sbin/init -``` - -```shell title="entrypoint.sh" -#!/bin/bash -is_ubuntu=$(test -f /etc/debian_version && echo Y) -[[ -z ${is_ubuntu} ]] && logadm="root:" || logadm="syslog:adm" -chown -R ${logadm} /var/log/xcat/ -. /etc/profile.d/xcat.sh -ps -ax -if [[ -d "/xcatdata.NEEDINIT" ]]; then - echo "initializing xCAT ..." - if [ ! -f "/xcatdata/.init-finished" ]; then - echo "first initalization, copying template..." - rsync -a /xcatdata.NEEDINIT/ /xcatdata - - echo "initalizing database." - xcatconfig --database - - touch /xcatdata/.init-finished - fi - - echo "initializing networks table if necessary..." - xcatconfig --updateinstall - XCATBYPASS=1 tabdump site | grep domain || XCATBYPASS=1 chtab key=domain site.value=example.com - - if ! [ -L /root/.xcat ]; then - if ! [ -d /xcatdata/.xcat ]; then - echo "backup data not found, regenerating certificates and copying..." - xcatconfig -c - rsync -a /root/.xcat/* /xcatdata/.xcat - fi - echo "create symbol link for /root/.xcat..." - rm -rf /root/.xcat/ - ln -sf -t /root /xcatdata/.xcat - fi - - if [ -d /xcatdata/.ssh ]; then - echo "copy backup keys in /root/.ssh..." - rsync -a /xcatdata/.ssh/ /root/.ssh/ - chmod 600 /root/.ssh/* - else - echo "backup keys not found, copying keys to /xcatdata/.ssh..." - xcatconfig --sshkeys - mkdir -p /xcatdata/.ssh - rsync -a /root/.ssh/ /xcatdata/.ssh/ - chmod 600 /xcatdata/.ssh/* - fi - - echo "reconfiguring hosts..." - makehosts - echo "reconfiguring dns..." - makedns - echo "reconfiguring dhcpd config..." - makedhcp -n - echo "reconfiguring dhcpd leases..." - makedhcp -a - - echo "initializing loop devices..." 
- # workaround for no loop device could be used by copycds - for i in {0..7}; do - test -b /dev/loop$i || mknod /dev/loop$i -m0660 b 7 $i - done - # workaround for missing `switch_macmap` (#13) - ln -sf /opt/xcat/bin/xcatclient /opt/xcat/probe/subcmds/bin/switchprobe - mv /xcatdata.NEEDINIT /xcatdata.orig -fi - -cat /etc/motd -HOSTIPS=$(ip -o -4 addr show up | grep -v "\" | xargs -I{} expr {} : ".*inet \([0-9.]*\).*") -echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" -echo "welcome to Dockerized xCAT, please login with" -[[ -n "$HOSTIPS" ]] && for i in $HOSTIPS; do echo " ssh root@$i -p 2200 "; done && echo "The initial password is \"cluster\"" -echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" - -systemctl start xcatd -#exec /sbin/init -rm -f /etc/nologin /var/run/nologin -``` - -## 2. Testing the application with Podman - -Podman is an alternative to Docker. The main difference is that Podman is daemon-less. - -We will focus on one specific feature which is `podman-play-kube`. - -While you might test the container with Docker with `docker-compose` or with `Minikube`, Podman offers the almost same experience as a Kubernetes Cluster without being overkill. - -`podman-play-kube` only supports `Pod`, `Deployment`, `PersistentVolumeClaim` and `ConfigMap`, but that's enough since it bridges the gap between `docker-compose` and the Kubernetes syntax. - -Let's write a `Pod` for xCAT: - -```yaml title="pod.yaml" -apiVersion: v1 -kind: Pod -metadata: - name: 'xcat' - namespace: default - labels: - app: 'xcat' -spec: - hostNetwork: true - containers: - - name: xcat - image: 'xcat:latest' - imagePullPolicy: Never - securityContext: - capabilities: - add: - - CAP_SYS_ADMIN - - NET_ADMIN - readOnlyRootFilesystem: false - runAsNonRoot: false - runAsUser: 0 - resources: - limits: - cpu: 200m - memory: 500Mi - requests: - cpu: 100m - memory: 200Mi - ports: - - name: xcatdport-tcp - containerPort: 3001 - protocol: TCP - - name: xcatdport-udp - containerPort: 3001 - protocol: UDP - - name: xcatiport-tcp - containerPort: 3002 - protocol: TCP - - name: xcatiport-udp - containerPort: 3002 - protocol: UDP - - name: echo-udp - containerPort: 7 - protocol: UDP - - name: rsync-tcp - containerPort: 873 - protocol: TCP - - name: rsync-udp - containerPort: 873 - protocol: UDP - - name: domain-tcp - containerPort: 53 - protocol: TCP - - name: domain-udp - containerPort: 53 - protocol: UDP - - name: bootps - containerPort: 67 - protocol: UDP - - name: dhcp - containerPort: 67 - protocol: TCP - - name: dhcpc - containerPort: 68 - protocol: TCP - - name: bootpc - containerPort: 68 - protocol: UDP - - name: tftp-tcp - containerPort: 69 - protocol: TCP - - name: tftp-udp - containerPort: 69 - protocol: UDP - - name: www-tcp - containerPort: 80 - protocol: TCP - - name: www-udp - containerPort: 80 - protocol: UDP - - name: sunrpc-udp - containerPort: 111 - protocol: UDP - - name: rsyslogd-tcp - containerPort: 514 - protocol: TCP - - name: rsyslogd-udp - containerPort: 514 - protocol: UDP - - name: pxe - containerPort: 4011 - protocol: TCP - - name: ipmi-tcp - containerPort: 623 - protocol: TCP - - name: ipmi-udp - containerPort: 623 - protocol: UDP - - name: ssh-tcp - containerPort: 2200 - protocol: TCP - - name: ssh-udp - containerPort: 2200 - protocol: UDP - volumeMounts: - - name: xcatdata - mountPath: /xcatdata - - name: cgroup - mountPath: /sys/fs/cgroup - readOnly: true - - name: varlogxcat - mountPath: /var/log/xcat - - mountPath: /tmp - name: tmp - subPath: tmp - - mountPath: /run - name: tmp - subPath: 
run - - mountPath: /run/lock - name: tmp - subPath: run-lock - volumes: - - name: tmp - hostPath: - path: ./tmp - - name: varlogxcat - hostPath: - path: ./logs - - name: xcatdata - hostPath: - path: ./xcat - - name: cgroup - hostPath: - path: /sys/fs/cgroup - type: Directory - restartPolicy: Always -``` - -If we were to write a `docker-compose.yaml`, we would open the same port, mount the same volumes and add the same capabilities. - -The main advantage is that if it works with podman, it will certainly work with Kubernetes. - -One main disadvantage is that `podman-play-kube` doesn't support the use of `networks` (which means no support for macvlan and ipvlan). The issue is tracked [here](https://github.com/containers/podman/issues/12965). - -## 3. Writing a Helm application - -Although it is not necessary to write a Helm application, some values may be redundant or must be abstracted. - -That's why we prefer to write Helm Charts instead of Kustomize. If the application is light enough, we can use Kustomize instead. - -To write a Helm application, we need to generalize the values (by using -`example.com` as domain for example). The "overlay" values will be stored -either inside a Git repository, more precisely, inside a fork. - -The Helm application must be available on a publicly accessible Git or Helm repository. - -The example for xCAT is stored inside [`helm/xcat`](https://github.com/SquareFactory/ClusterFactory/tree/main/helm/xcat). - -## 4. Writing the Argo CD Application and custom values - -After writing the Helm Chart, you can write the Argo CD `Application`. - -The example for xCAT is stored inside [`argo/provisioning/apps`](https://github.com/SquareFactory/ClusterFactory/blob/main/argo.example/provisioning/apps/xcat-app.yaml). - -The custom values are stored inside the `helm/xcat` directory. If the Helm application is not a Git repository, it's better to use the [subchart pattern by using helm dependencies](https://github.com/argoproj/argocd-example-apps/blob/master/helm-dependency/README.md). - -## 5. Testing on the Kubernetes cluster - -Our favorite software for debugging is [Lens](https://k8slens.dev). - -If you have deployed Prometheus, we can see the CPU and Memory usage of the container. - -![image-20220517160622884](999-develop-apps.assets/image-20220517160622884.png) diff --git a/web/docs/main-concepts/04-apps/07-grendel.assets/grendel.drawio.svg b/web/docs/main-concepts/04-apps/07-grendel.assets/grendel.drawio.svg new file mode 100644 index 000000000..07ea346db --- /dev/null +++ b/web/docs/main-concepts/04-apps/07-grendel.assets/grendel.drawio.svg @@ -0,0 +1,4 @@ + + + +
Grendel
Grendel
dhcpd
dhcpd
tftpd
tftpd
httpd
httpd
Compute
Node
Compute...
OS
OS
postscripts
postscripts
SP
SP
Compute
Node
Compute...
OS
OS
postscripts
postscripts
SP
SP
Service Network (IPMI, ...)
Service Network (IPMI, ...)
Management Network
Management Network
ipmi client
ipmi client
Text is not SVG - cannot display
\ No newline at end of file diff --git a/web/docs/main-concepts/04-apps/07-xcat.assets/overlayfs.drawio.svg b/web/docs/main-concepts/04-apps/07-grendel.assets/overlayfs.drawio.svg similarity index 100% rename from web/docs/main-concepts/04-apps/07-xcat.assets/overlayfs.drawio.svg rename to web/docs/main-concepts/04-apps/07-grendel.assets/overlayfs.drawio.svg diff --git a/web/docs/main-concepts/04-apps/07-grendel.mdx b/web/docs/main-concepts/04-apps/07-grendel.mdx new file mode 100644 index 000000000..06fccdb9a --- /dev/null +++ b/web/docs/main-concepts/04-apps/07-grendel.mdx @@ -0,0 +1,72 @@ +# Grendel, the Bare-Metal Provisioner + +[Grendel](https://github.com/SquareFactory/grendel) is a solution used to deploy and manage HPC (High Performance Computing) clusters. It is designed to automate the process of configuring and installing software on compute nodes using a custom DHCP, DNS, TFTP, and HTTP server written in Go. + +## Architecture + +The architecture is the following: + +
+ +![grendel-arch](07-grendel.assets/grendel.drawio.svg#invert-on-dark) + +
+ +**Grendel** is deployed as a container and runs multiple network services responsible for the provisioning of bare-metal systems. + +**The Service Processor (SP)** controls the hardware and is used to perform out-of-band hardware control (e.g. Integrated Management Module (IMM), Flexible Service Processor (FSP), Baseboard Management Controller (BMC), etc). + +The Service Processor is connected to Grendel via **the Service Network.** + +**The Management Network** is used for OS provisioning (via PXE). + +## Why Grendel ? + +Our main criteria for choosing a bare metal provisioning solution is: + +- Operating system image management: provisioning methods, supported operating systems, ease of use. +- BMC configuration (IPMI, ...) +- Configuration management (declarative, post-boot scripts, backups, ...) + +While a lot of solutions exist for bare-metal provisioning like [OpenStack Ironic](https://wiki.openstack.org/wiki/Ironic) or [MAAS](https://maas.io/), only a few can do **disk-less provisioning**. + +Disk-less (or Stateless) provisioning is based on an OverlayFS root, with the OS being loaded from a SquashFS image. The OverlayFS is mounted as a `tmpfs`, that is, in the RAM. + +
+ +![upper-lower](07-grendel.assets/overlayfs.drawio.svg#invert-on-dark) + +
+
+Since the root is mounted in RAM, restarting a node will "clean it up", similar to a Docker container.
+
+With OverlayFS, we follow a proper DevOps practice where the architecture is immutable and mutable data is stored on disks.
+
+## How does it work ?
+
+### OS Image Building
+
+Grendel doesn't have an OS image builder. This is because we do not want the users to use a non-standard tool to build their OS images.
+
+Instead, we prefer to use [Packer](https://www.packer.io), mksquashfs and [Dracut](https://github.com/dracutdevs/dracut) to automate the building of OS images, Linux kernels and initramfs images.
+
+The steps to build a diskless OS image are the following:
+
+1. Use Packer to build an image that includes the operating system and any additional packages or customizations you require. This image will serve as the base for the diskless OS.
+2. Mount the OS image, chroot into the OS image and execute dracut to generate the initramfs.
+3. Extract the kernel and initramfs.
+4. Use mksquashfs to create a squashfs filesystem image that will serve as the base layer of the overlayfs. The squashfs image should include any read-only files that will be shared across multiple diskless nodes.
+
+### Provisioning
+
+The steps to provision an OS image are the following:
+
+1. Enable network boot in the BIOS of the nodes. This will cause the node to broadcast a DHCP request for an IP address and PXE server.
+2. The Grendel DHCP server will respond with an IP address based on the node's MAC address, and send the Network Boot Program (NBP) - in this case, the iPXE firmware.
+3. The iPXE firmware will then download the Linux kernel and initramfs.
+4. The Linux kernel options `rd.live.*` will be used to download the squashfs file from the Grendel server and mount the OS image as a /dev/loop device.
+5. Once the base image is mounted, an overlayfs will be created using the loop device as the base image and an upper layer created using a tmpfs (RAM). This will provide a writeable layer for temporary files and system changes.
+
+### Postscript
+
+After initializing the operating system image, a SystemD service can be utilized to retrieve the postscript file stored on the Grendel HTTP server and execute it.
diff --git a/web/docs/main-concepts/04-apps/07-xcat.assets/xcat.drawio.png b/web/docs/main-concepts/04-apps/07-xcat.assets/xcat.drawio.png
deleted
index 01f297ab5..000000000
Binary files a/web/docs/main-concepts/04-apps/07-xcat.assets/xcat.drawio.png and /dev/null differ
diff --git a/web/docs/main-concepts/04-apps/07-xcat.mdx b/web/docs/main-concepts/04-apps/07-xcat.mdx
deleted file mode 100644
index 09fe4e4cf..000000000
--- a/web/docs/main-concepts/04-apps/07-xcat.mdx
+++ /dev/null
@@ -1,61 +0,0 @@
-# xCAT, the Bare-Metal Provisioner
-
-[xCAT](https://xcat.org) is complete solution for bare metal provisioning for
-High-Performance Computing clusters.
-
-## Architecture
-
-The architecture is the following:
-
-
- -![Xcat-arch](07-xcat.assets/xcat.drawio.png#invert-on-dark) - -
- -**xCAT Management Node** is deployed as a container and runs multiple network services responsible for the provisioning of bare-metal systems. - -xCAT is also able to have "slaves" named **Service Node.** - -**The Service Processor (SP)** controls the hardware and is used to perform out-of-band hardware control (e.g. Integrated Management Module (IMM), Flexible Service Processor (FSP), Baseboard Management Controller (BMC), etc). - -The Service Processor is connected to xCAT via **the Service Network.** - -**The Management Network** is used for OS provisioning (via xNBA or PXE). - -## Why xCAT ? - -Our main criteria for choosing a bare metal provisioning solution is: - -- Operating system image management: provisioning methods, supported operating systems, ease of use. -- BMC configuration (IPMI, HMC, FSP, OpenBMC) -- Configuration management (Declarative, post-boot scripts, backups, ...) - -While a lot of solutions exist for bare-metal provisioning like [OpenStack Ironic](https://wiki.openstack.org/wiki/Ironic) or [MAAS](https://maas.io/), only a few are able to do **disk-less provisioning**. - -Disk-less (or Stateless) provisioning is based on an OverlayFS root, with the OS being loaded from a SquashFS image. The OverlayFS is mounted as a `tmpfs`, that is, in the RAM. - -
- -![upper-lower](07-xcat.assets/overlayfs.drawio.svg#invert-on-dark) - -
- -Since the root is mounted in RAM, restarting a node will "clean it up", similar to a Docker container. - -With OverlayFS, we follow a proper DevOps practice where the architecture is immutable and mutable data is stored on disks. - -Moreover, we are not only looking for a well-maintained open-source project, but also an easy-to-use solution. - -export const xCatAge = new Date(1999, 10, 31).getTime(); - - - We were interested in Grendel because it promises - to be better than xCAT, but it is still too young and its documentation is not yet perfect. xCAT - is {Math.abs(new Date(Date.now() - xCatAge).getUTCFullYear() - 1970)} years old, is still - maintained by iBM and has every functionality for bare metal provisioning. - - -While xCAT is not friendly to GitOps or Kubernetes, we plan to develop a way to configure xCAT declaratively from Kubernetes (probably by developing a [Kubernetes Operator](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/), or by applying the [stanza file](https://xcat-docs.readthedocs.io/en/stable/guides/admin-guides/references/man5/xcatstanzafile.5.html) at the boot of container). - -For now, the only way to use xCAT is to connect via SSH to your server.