diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..216de0c51 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,8 @@ +root = true + +[ansible/**] +charset = utf-8 +end_of_line = lf +indent_size = 2 +indent_style = space +insert_final_newline = true \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 000000000..ebc3e99f5 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,86 @@ +variables: + GIT_STRATEGY: clone + GIT_CLEAN_FLAGS: "-ffdx" + +stages: + - build + - deploy-stage-ansible + - deploy-stage-helm + - test-stage + - deploy-prod-nginx + - deploy-prod-helm + +include: + - component: $CI_SERVER_FQDN/rse/docker/images/ansible/ansible-lint@10.2.6 + inputs: + stage: build + dir: ansible + + - component: $CI_SERVER_FQDN/rse/docker/images/ansible/ansible-deploy@10.2.6 + inputs: + stage: deploy-stage-ansible + dir: ansible + inventory: gesis-stage + playbook: gesis.yml + ssh-user: ansible + ssh-key-type: ed25519 + +.gesis helm deploy: + image: + name: docker-private.gesis.intra/gesis/ilcm/orc2/k8s:latest + entrypoint: [""] + variables: + HELM_ENVIRONMENT: template + script: + - cat $git_crypt_secret_key | base64 -d > git_crypt_secret_key + - git-crypt unlock git_crypt_secret_key + - kubectl config use-context ${CI_PROJECT_PATH}:${HELM_ENVIRONMENT} + - helm version + - | + for d in ./mybinder*/; do + helm dependency update "$d"; + done + - | + for chart in mybinder-kube-system mybinder-tigera-operator; do + helm upgrade \ + ${chart:9} ./${chart} \ + --cleanup-on-fail \ + --create-namespace \ + --history-max 3 \ + --install \ + --namespace=${chart}; + done + - | + helm lint ./mybinder \ + --values ./config/gesis-${HELM_ENVIRONMENT}.yaml \ + --values ./secrets/config/common/common.yaml \ + --values ./secrets/config/common/cryptnono.yaml \ + --values ./secrets/config/common/gesis.yaml \ + --values ./secrets/config/gesis-${HELM_ENVIRONMENT}.yaml + - | + helm upgrade \ + binderhub ./mybinder \ + 
--cleanup-on-fail \ + --create-namespace \ + --history-max 3 \ + --install \ + --namespace=gesis \ + --render-subchart-notes \ + --values ./config/gesis-${HELM_ENVIRONMENT}.yaml \ + --values ./secrets/config/common/common.yaml \ + --values ./secrets/config/common/cryptnono.yaml \ + --values ./secrets/config/common/gesis.yaml \ + --values ./secrets/config/gesis-${HELM_ENVIRONMENT}.yaml + +gesis helm stage deploy: + resource_group: stage + stage: deploy-stage-helm + variables: + HELM_ENVIRONMENT: stage + extends: + - .gesis helm deploy + +smoke test to stage cluster: + stage: test-stage + script: + - curl https://notebooks-test.gesis.org/binder/ \ No newline at end of file diff --git a/.gitlab/agents/stage/config.yaml b/.gitlab/agents/stage/config.yaml new file mode 100644 index 000000000..59f3bb059 --- /dev/null +++ b/.gitlab/agents/stage/config.yaml @@ -0,0 +1,3 @@ +ci_access: + projects: + - id: methods-hub/interactive-environment \ No newline at end of file diff --git a/ansible/gesis.yml b/ansible/gesis.yml new file mode 100644 index 000000000..430e341b0 --- /dev/null +++ b/ansible/gesis.yml @@ -0,0 +1,35 @@ +- name: Configure servers that are part of Kubernetes cluster + hosts: all + gather_facts: false + become: true + roles: + - k8s-common +- name: Configure Kubernetes control panel + hosts: kubernetes_control_panel + gather_facts: false + become: true + roles: + - k8s-control-panel +- name: Configure Kubernetes workers + hosts: kubernetes_workers + gather_facts: false + become: true + roles: + - k8s-worker +- name: Configure Kubernetes Persistent Volumes + hosts: kubernetes_control_panel + gather_facts: false + become: true + roles: + - k8s-pv +- name: Configure JupyterHub workers + hosts: jupyterhub_single_user + gather_facts: false + become: true + roles: + - k8s-worker +- name: Configure mybinder.org Kubernetes cluster + hosts: kubernetes_control_panel + gather_facts: false + roles: + - mybinder diff --git a/ansible/inventories/gesis-stage 
b/ansible/inventories/gesis-stage new file mode 100644 index 000000000..8e600c8a0 --- /dev/null +++ b/ansible/inventories/gesis-stage @@ -0,0 +1,49 @@ +[all] +#svko-ilcm04 ansible_host=194.95.75.14 ansible_ssh_user=ansible ansible_become_pass='{{ become_pass_194_95_75_14 }}' +svko-css-backup-node ansible_host=194.95.75.20 ansible_ssh_user=ansible ansible_become_pass='{{ become_pass_194_95_75_20 }}' +svko-k8s-test01 ansible_host=194.95.75.21 ansible_ssh_user=ansible ansible_become_pass='{{ become_pass_194_95_75_21 }}' +svko-k8s-test02 ansible_host=194.95.75.22 ansible_ssh_user=ansible ansible_become_pass='{{ become_pass_194_95_75_22 }}' +svko-k8s-test03 ansible_host=194.95.75.23 ansible_ssh_user=ansible ansible_become_pass='{{ become_pass_194_95_75_23 }}' + +[all:vars] +INVENTORY_NAME=stage +K8S_CONTROL_PLANE_ENDPOINT=194.95.75.21 +K8S_CONTROL_PLANE_ALIAS=svko-k8s-test01 + +[notebooks_gesis_org] +svko-css-backup-node + +[kubernetes_control_panel] +svko-k8s-test01 + +[kubernetes_control_panel:vars] +GRAFANA_CAPACITY_STORAGE=2Gi +JUPYTERHUB_CAPACITY_STORAGE=2Gi +PROMETHEUS_CAPACITY_STORAGE=15Gi + +[kubernetes_workers] +#svko-ilcm04 +svko-css-backup-node +svko-k8s-test02 +svko-k8s-test03 + +[ingress] +svko-css-backup-node + +[harbor] +svko-css-backup-node + +[binderhub] +svko-k8s-test02 + +[jupyterhub] +svko-k8s-test02 + +[jupyterhub_single_user] +svko-k8s-test03 + +[prometheus] +svko-css-backup-node + +[grafana] +svko-css-backup-node diff --git a/ansible/roles/jupyterhub/files/var/lib/kubelet/config.yaml b/ansible/roles/jupyterhub/files/var/lib/kubelet/config.yaml new file mode 100644 index 000000000..cbe083dae --- /dev/null +++ b/ansible/roles/jupyterhub/files/var/lib/kubelet/config.yaml @@ -0,0 +1,45 @@ +apiVersion: kubelet.config.k8s.io/v1beta1 +authentication: + anonymous: + enabled: false + webhook: + cacheTTL: 0s + enabled: true + x509: + clientCAFile: /etc/kubernetes/pki/ca.crt +authorization: + mode: Webhook + webhook: + cacheAuthorizedTTL: 0s + 
cacheUnauthorizedTTL: 0s +cgroupDriver: systemd +clusterDNS: + - 10.96.0.10 +clusterDomain: cluster.local +cpuManagerReconcilePeriod: 0s +evictionPressureTransitionPeriod: 0s +fileCheckFrequency: 0s +healthzBindAddress: 127.0.0.1 +healthzPort: 10248 +httpCheckFrequency: 0s +imageMinimumGCAge: 0s +kind: KubeletConfiguration +logging: + flushFrequency: 0 + options: + json: + infoBufferSize: "0" + verbosity: 0 +memorySwap: {} +nodeStatusReportFrequency: 0s +nodeStatusUpdateFrequency: 0s +resolvConf: /run/systemd/resolve/resolv.conf +rotateCertificates: true +runtimeRequestTimeout: 0s +shutdownGracePeriod: 0s +shutdownGracePeriodCriticalPods: 0s +staticPodPath: /etc/kubernetes/manifests +streamingConnectionIdleTimeout: 0s +syncFrequency: 0s +volumeStatsAggPeriod: 0s +maxPods: 500 diff --git a/ansible/roles/jupyterhub/tasks/main.yml b/ansible/roles/jupyterhub/tasks/main.yml new file mode 100644 index 000000000..1409bf70d --- /dev/null +++ b/ansible/roles/jupyterhub/tasks/main.yml @@ -0,0 +1,15 @@ +- name: Stop kubelet service + ansible.builtin.systemd: + name: kubelet + state: stopped +- name: Copy kubelet configuration + ansible.builtin.copy: + src: ../var/lib/kubelet/config.yaml + dest: /var/lib/kubelet/config.yaml + owner: root + group: root + mode: u=rw,g=r,o=r +- name: Restarted kubelet service + ansible.builtin.systemd: + name: kubelet + state: restarted diff --git a/ansible/roles/k8s-common/files/etc/containerd/config.toml b/ansible/roles/k8s-common/files/etc/containerd/config.toml new file mode 100644 index 000000000..320b460aa --- /dev/null +++ b/ansible/roles/k8s-common/files/etc/containerd/config.toml @@ -0,0 +1,250 @@ +disabled_plugins = [] +imports = [] +oom_score = 0 +plugin_dir = "" +required_plugins = [] +root = "/orc2_data/containerd" +state = "/run/containerd" +temp = "" +version = 2 + +[cgroup] + path = "" + +[debug] + address = "" + format = "" + gid = 0 + level = "" + uid = 0 + +[grpc] + address = "/run/containerd/containerd.sock" + gid = 0 + 
max_recv_message_size = 16777216 + max_send_message_size = 16777216 + tcp_address = "" + tcp_tls_ca = "" + tcp_tls_cert = "" + tcp_tls_key = "" + uid = 0 + +[metrics] + address = "" + grpc_histogram = false + +[plugins] + + [plugins."io.containerd.gc.v1.scheduler"] + deletion_threshold = 0 + mutation_threshold = 100 + pause_threshold = 0.02 + schedule_delay = "0s" + startup_delay = "100ms" + + [plugins."io.containerd.grpc.v1.cri"] + device_ownership_from_security_context = false + disable_apparmor = false + disable_cgroup = false + disable_hugetlb_controller = true + disable_proc_mount = false + disable_tcp_service = true + enable_selinux = false + enable_tls_streaming = false + enable_unprivileged_icmp = false + enable_unprivileged_ports = false + ignore_image_defined_volumes = false + max_concurrent_downloads = 3 + max_container_log_line_size = 16384 + netns_mounts_under_state_dir = false + restrict_oom_score_adj = false + sandbox_image = "registry.k8s.io/pause:3.6" + selinux_category_range = 1024 + stats_collect_period = 10 + stream_idle_timeout = "4h0m0s" + stream_server_address = "127.0.0.1" + stream_server_port = "0" + systemd_cgroup = false + tolerate_missing_hugetlb_controller = true + unset_seccomp_profile = "" + + [plugins."io.containerd.grpc.v1.cri".cni] + bin_dir = "/opt/cni/bin" + conf_dir = "/etc/cni/net.d" + conf_template = "" + ip_pref = "" + max_conf_num = 1 + + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "runc" + disable_snapshot_annotations = true + discard_unpacked_layers = false + ignore_rdt_not_enabled_errors = false + no_pivot = false + snapshotter = "overlayfs" + + [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime] + base_runtime_spec = "" + cni_conf_dir = "" + cni_max_conf_num = 0 + container_annotations = [] + pod_annotations = [] + privileged_without_host_devices = false + runtime_engine = "" + runtime_path = "" + runtime_root = "" + runtime_type = "" + + 
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime.options] + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + base_runtime_spec = "" + cni_conf_dir = "" + cni_max_conf_num = 0 + container_annotations = [] + pod_annotations = [] + privileged_without_host_devices = false + runtime_engine = "" + runtime_path = "" + runtime_root = "" + runtime_type = "io.containerd.runc.v2" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + BinaryName = "" + CriuImagePath = "" + CriuPath = "" + CriuWorkPath = "" + IoGid = 0 + IoUid = 0 + NoNewKeyring = false + NoPivotRoot = false + Root = "" + ShimCgroup = "" + SystemdCgroup = true + + [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime] + base_runtime_spec = "" + cni_conf_dir = "" + cni_max_conf_num = 0 + container_annotations = [] + pod_annotations = [] + privileged_without_host_devices = false + runtime_engine = "" + runtime_path = "" + runtime_root = "" + runtime_type = "" + + [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime.options] + + [plugins."io.containerd.grpc.v1.cri".image_decryption] + key_model = "node" + + [plugins."io.containerd.grpc.v1.cri".registry] + config_path = "" + + [plugins."io.containerd.grpc.v1.cri".registry.auths] + + [plugins."io.containerd.grpc.v1.cri".registry.configs] + + [plugins."io.containerd.grpc.v1.cri".registry.headers] + + [plugins."io.containerd.grpc.v1.cri".registry.mirrors] + + [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming] + tls_cert_file = "" + tls_key_file = "" + + [plugins."io.containerd.internal.v1.opt"] + path = "/opt/containerd" + + [plugins."io.containerd.internal.v1.restart"] + interval = "10s" + + [plugins."io.containerd.internal.v1.tracing"] + sampling_ratio = 1.0 + service_name = "containerd" + + [plugins."io.containerd.metadata.v1.bolt"] + content_sharing_policy = "shared" + + 
[plugins."io.containerd.monitor.v1.cgroups"] + no_prometheus = false + + [plugins."io.containerd.runtime.v1.linux"] + no_shim = false + runtime = "runc" + runtime_root = "" + shim = "containerd-shim" + shim_debug = false + + [plugins."io.containerd.runtime.v2.task"] + platforms = ["linux/amd64"] + sched_core = false + + [plugins."io.containerd.service.v1.diff-service"] + default = ["walking"] + + [plugins."io.containerd.service.v1.tasks-service"] + rdt_config_file = "" + + [plugins."io.containerd.snapshotter.v1.aufs"] + root_path = "" + + [plugins."io.containerd.snapshotter.v1.btrfs"] + root_path = "" + + [plugins."io.containerd.snapshotter.v1.devmapper"] + async_remove = false + base_image_size = "" + discard_blocks = false + fs_options = "" + fs_type = "" + pool_name = "" + root_path = "" + + [plugins."io.containerd.snapshotter.v1.native"] + root_path = "" + + [plugins."io.containerd.snapshotter.v1.overlayfs"] + root_path = "" + upperdir_label = false + + [plugins."io.containerd.snapshotter.v1.zfs"] + root_path = "" + + [plugins."io.containerd.tracing.processor.v1.otlp"] + endpoint = "" + insecure = false + protocol = "" + +[proxy_plugins] + +[stream_processors] + + [stream_processors."io.containerd.ocicrypt.decoder.v1.tar"] + accepts = ["application/vnd.oci.image.layer.v1.tar+encrypted"] + args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"] + env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"] + path = "ctd-decoder" + returns = "application/vnd.oci.image.layer.v1.tar" + + [stream_processors."io.containerd.ocicrypt.decoder.v1.tar.gzip"] + accepts = ["application/vnd.oci.image.layer.v1.tar+gzip+encrypted"] + args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"] + env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"] + path = "ctd-decoder" + returns = "application/vnd.oci.image.layer.v1.tar+gzip" + +[timeouts] + "io.containerd.timeout.bolt.open" = "0s" + 
"io.containerd.timeout.shim.cleanup" = "5s" + "io.containerd.timeout.shim.load" = "5s" + "io.containerd.timeout.shim.shutdown" = "3s" + "io.containerd.timeout.task.state" = "2s" + +[ttrpc] + address = "" + gid = 0 + uid = 0 diff --git a/ansible/roles/k8s-common/tasks/main.yml b/ansible/roles/k8s-common/tasks/main.yml new file mode 100644 index 000000000..3f48fa4e3 --- /dev/null +++ b/ansible/roles/k8s-common/tasks/main.yml @@ -0,0 +1,160 @@ +- name: Create directory /etc/apt/keyrings if it does not exist + ansible.builtin.file: + state: directory + path: /etc/apt/keyrings + owner: root + group: root + mode: u=rwx,g=rx,o=rx +- name: Remove old Kubernetes public GPG key + ansible.builtin.file: + path: /etc/apt/trusted.gpg.d/kubernetes-archive-keyring.gpg + state: absent +- name: Remove old Kubernetes public GPG key + ansible.builtin.file: + path: /etc/apt/trusted.gpg.d/kubernetes-archive-keyring.asc + state: absent +- name: Remove old Kubernetes repository + ansible.builtin.apt_repository: + repo: "deb [signed-by=/etc/apt/trusted.gpg.d/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main" + filename: kubernetes + state: absent +- name: Remove old Kubernetes repository + ansible.builtin.apt_repository: + repo: "deb [signed-by=/etc/apt/trusted.gpg.d/kubernetes.asc] https://pkgs.k8s.io/core:/stable:/v1.27/deb/ /" + filename: kubernetes + state: absent +- name: Ensure DOCKER_CLIENT_TIMEOUT is set + ansible.builtin.lineinfile: + path: /etc/environment + regexp: '^DOCKER_CLIENT_TIMEOUT=' + line: DOCKER_CLIENT_TIMEOUT=180 +- name: Disable SWAP since kubernetes can't work with swap enabled + ansible.builtin.command: swapoff -a + changed_when: false +- name: Disable SWAP in fstab since kubernetes can't work with swap enabled + ansible.builtin.replace: + path: /etc/fstab + regexp: '^([^#].*?\sswap\s+sw\s+.*)$' + replace: '# \1' +- name: Disable Firewall + ansible.builtin.command: ufw disable + changed_when: false +- name: Allow IP forward + 
ansible.posix.sysctl: + name: net.ipv4.ip_forward + value: '1' + state: present +- name: Set inotify max user instances + ansible.posix.sysctl: + name: fs.inotify.max_user_instances + value: '1280' + state: present +- name: Set inotify max user watches + ansible.posix.sysctl: + name: fs.inotify.max_user_watches + value: '655360' + state: present +- name: Create directory for Persistent Volume + ansible.builtin.import_tasks: + file: pv.yml +- name: Add Docker public GPG key + ansible.builtin.get_url: + url: https://download.docker.com/linux/ubuntu/gpg + dest: /etc/apt/trusted.gpg.d/docker.asc + mode: '0644' + force: true +- name: Add Docker repository + ansible.builtin.apt_repository: + repo: "deb [signed-by=/etc/apt/trusted.gpg.d/docker.asc] https://download.docker.com/linux/ubuntu jammy stable" + filename: docker + state: present +- name: Download Kubernetes public GPG key + ansible.builtin.get_url: + url: https://pkgs.k8s.io/core:/stable:/v1.27/deb/Release.key + dest: /tmp/kubernetes-archive-keyring.asc + mode: '0644' + force: true +- name: Convert the public GPG key to binary + ansible.builtin.command: + argv: + - gpg + - --yes + - --dearmor + - --output + - /tmp/kubernetes.gpg + - /tmp/kubernetes-archive-keyring.asc + changed_when: false +- name: Copy GPG key + ansible.builtin.copy: + src: /tmp/kubernetes.gpg + dest: /etc/apt/keyrings/kubernetes.gpg + remote_src: true + mode: '0644' +- name: Add Kubernetes repository + ansible.builtin.apt_repository: + repo: "deb [signed-by=/etc/apt/keyrings/kubernetes.gpg] https://pkgs.k8s.io/core:/stable:/v1.27/deb/ /" + filename: kubernetes + state: present +- name: Add Helm public GPG key + ansible.builtin.get_url: + url: https://baltocdn.com/helm/signing.asc + dest: /etc/apt/trusted.gpg.d/helm.asc + mode: '0644' + force: true +- name: Add Helm repository + ansible.builtin.apt_repository: + repo: "deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/helm.asc] https://baltocdn.com/helm/stable/debian/ all main" + filename: 
helm + state: present +- name: Install dependencies + ansible.builtin.apt: + update_cache: true + pkg: + - rsync + - python3 + - python3-kubernetes + - python3-invoke + - python3-fabric + - apt-transport-https + - ca-certificates + - curl + - containerd.io=1.7.* + - kubelet=1.28.* + - kubeadm=1.28.* + - kubectl=1.28.* + - helm=3.15.* +- name: Copy containerd configuration file + ansible.builtin.copy: + src: files/etc/containerd/config.toml + dest: /etc/containerd/config.toml + owner: root + group: root + mode: u=rw,g=r,o=r +- name: Reload service containerd + ansible.builtin.systemd: + name: containerd + state: restarted +- name: Enable service containerd + ansible.builtin.systemd: + name: containerd + enabled: true + masked: false +- name: Modify kernel module overlay + ansible.builtin.command: modprobe overlay + changed_when: false +- name: Modify kernel module br_netfilter + ansible.builtin.command: modprobe br_netfilter + changed_when: false +- name: Create file for list of kernel modules required by containerd + ansible.builtin.file: + path: "/etc/modules-load.d/containerd.conf" + state: "touch" + owner: root + group: root + mode: u=rw,g=r,o=r +- name: Populate list of kernel modules required by containerd + ansible.builtin.blockinfile: + path: "/etc/modules-load.d/containerd.conf" + block: | + overlay + br_netfilter diff --git a/ansible/roles/k8s-common/tasks/pv.yml b/ansible/roles/k8s-common/tasks/pv.yml new file mode 100644 index 000000000..6a11ee11e --- /dev/null +++ b/ansible/roles/k8s-common/tasks/pv.yml @@ -0,0 +1,28 @@ +- name: Create persistent directories in /orc2_data if it does not exist + ansible.builtin.file: + path: "/orc2_data/{{ item }}" + state: directory + owner: root + group: root + mode: u=rwx,g=rx,o=rx + loop: + - jupyterhub + - containerd + - repo2docker + - prometheus + - grafana + - alertmanager + +- name: Create persistent directories in /harbor/ if it does not exist + ansible.builtin.file: + path: "/harbor/{{ item }}" + state: 
directory + owner: root + group: root + mode: u=rwx,g=rx,o=rx + loop: + - jobservice + - registry + - redis + - trivy + - database diff --git a/ansible/roles/k8s-control-panel/files/calico/custom-resources.yaml b/ansible/roles/k8s-control-panel/files/calico/custom-resources.yaml new file mode 100644 index 000000000..4a90b5bc2 --- /dev/null +++ b/ansible/roles/k8s-control-panel/files/calico/custom-resources.yaml @@ -0,0 +1,24 @@ +# This section includes base Calico installation configuration. +# For more information, see: https://projectcalico.docs.tigera.io/master/reference/installation/api#operator.tigera.io/v1.Installation +apiVersion: operator.tigera.io/v1 +kind: Installation +metadata: + name: default +spec: + # Configures Calico networking. + calicoNetwork: + # Note: The ipPools section cannot be modified post-install. + ipPools: + - blockSize: 26 + cidr: 10.244.0.0/16 + encapsulation: VXLANCrossSubnet + natOutgoing: Enabled + nodeSelector: all() +--- +# This section configures the Calico API server. 
+# For more information, see: https://projectcalico.docs.tigera.io/master/reference/installation/api#operator.tigera.io/v1.APIServer +apiVersion: operator.tigera.io/v1 +kind: APIServer +metadata: + name: default +spec: {} diff --git a/ansible/roles/k8s-control-panel/files/cron/kill-after-timeout-pods.py b/ansible/roles/k8s-control-panel/files/cron/kill-after-timeout-pods.py new file mode 100644 index 000000000..e52f7d828 --- /dev/null +++ b/ansible/roles/k8s-control-panel/files/cron/kill-after-timeout-pods.py @@ -0,0 +1,93 @@ +"""Kill pods in Kubernetes cluster after timeout""" + +import argparse +import logging +import datetime + +from kubernetes import client, config + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S" +) +logger = logging.getLogger("kill-after-timeout-pods") +logger.setLevel(logging.WARNING) + +NAMESPACE = "gesis" +BINDER_TIME_OUT = 6 # hours + + +def get_timed_out_pods(): + """Get list of all timed out pods that are single user running pod""" + time_now = datetime.datetime.now(datetime.timezone.utc) + all_timed_out_pods = [] + + api_response = v1.list_namespaced_pod(NAMESPACE) + for pod in api_response.items: + pod_run_time = time_now - pod.metadata.creation_timestamp + pod_run_time_in_hours = pod_run_time.total_seconds() / 3600 + logger.debug( + "Pod %s (%s) is running for %.1f hours.", + pod.metadata.name, + pod.status.phase, + pod_run_time_in_hours, + ) + if ( + pod.metadata.name.startswith("jupyter-") + and pod_run_time_in_hours > BINDER_TIME_OUT + ): + all_timed_out_pods.append(pod) + logger.debug("Pod %s added to the list.", pod.metadata.name) + + return all_timed_out_pods + + +def kill_pod(pod): + """Kill single pod""" + logger.info("Requesting delete of pod %s ...", pod.metadata.name) + try: + api_response = v1.delete_namespaced_pod(pod.metadata.name, NAMESPACE) + logger.info("Pod %s deleted.", api_response.metadata.name) + except client.exceptions.ApiException as exception: + 
logger.info( + "Fail to delete pod %s due %s", pod.metadata.name, exception + ) + + +def kill_timed_out_pods(): + """Kill timed out pods""" + logger.info("Starting inspection of Kubernetes pod ...") + all_timed_out_pods = get_timed_out_pods() + for timed_out_pod in all_timed_out_pods: + kill_pod(timed_out_pod) + logger.info("%s pods deleted.", len(all_timed_out_pods)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Open Research Computing v2 Kill Timed Out Pods Cron Job", + description="Cron job to kill Kubernetes pods that timed out", + ) + parser.add_argument( + "-c", + "--kube-config", + type=str, + default="~/.kube/config", + help="Location of Kubernetes configuration file", + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="Display log information" + ) + parser.add_argument( + "-vv", "--debug", action="store_true", help="Display debug information" + ) + args = parser.parse_args() + if args.verbose: + logger.setLevel(logging.INFO) + if args.debug: + logger.setLevel(logging.DEBUG) + + config.load_kube_config(config_file=args.kube_config) + + v1 = client.CoreV1Api() + + kill_timed_out_pods() diff --git a/ansible/roles/k8s-control-panel/files/cron/kill-succeeded-pods.py b/ansible/roles/k8s-control-panel/files/cron/kill-succeeded-pods.py new file mode 100644 index 000000000..deb909c1d --- /dev/null +++ b/ansible/roles/k8s-control-panel/files/cron/kill-succeeded-pods.py @@ -0,0 +1,79 @@ +"""Kill succeeded pods in Kubernetes cluster""" + +import argparse +import logging + +from kubernetes import client, config + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S" +) +logger = logging.getLogger("kill-succeeded-pods") +logger.setLevel(logging.WARNING) + +NAMESPACE = "gesis" + + +def get_succeeded_pods(): + """Get list of all succeeded pods that are single user running pod""" + all_succeeded_pods = [] + + api_response = v1.list_namespaced_pod(NAMESPACE) + for pod in 
api_response.items: + logger.debug("Pod %s is %s", pod.metadata.name, pod.status.phase) + if pod.status.phase == "Succeeded" and pod.metadata.name.startswith("jupyter-"): + all_succeeded_pods.append(pod) + + return all_succeeded_pods + + +def kill_pod(pod): + """Kill single pod""" + logger.info("Requesting delete of pod %s ...", pod.metadata.name) + try: + api_response = v1.delete_namespaced_pod(pod.metadata.name, NAMESPACE) + logger.info("Pod %s deleted.", api_response.metadata.name) + except client.exceptions.ApiException as exception: + logger.info( + "Fail to delete pod %s due %s", pod.metadata.name, exception + ) + + +def kill_succeeded_pods(): + """Kill succeeded pods""" + logger.info("Starting inspection of Kubernetes pod ...") + all_succeeded_pods = get_succeeded_pods() + for succeeded_pod in all_succeeded_pods: + kill_pod(succeeded_pod) + logger.info("%s pods deleted.", len(all_succeeded_pods)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Open Research Computing v2 Kill Succeeded Pods Cron Job", + description="Cron job to kill Kubernetes pods in Succeeded status that are very old", + ) + parser.add_argument( + "-c", + "--kube-config", + type=str, + default="~/.kube/config", + help="Location of Kubernetes configuration file", + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="Display log information" + ) + parser.add_argument( + "-vv", "--debug", action="store_true", help="Display debug information" + ) + args = parser.parse_args() + if args.verbose: + logger.setLevel(logging.INFO) + if args.debug: + logger.setLevel(logging.DEBUG) + + config.load_kube_config(config_file=args.kube_config) + + v1 = client.CoreV1Api() + + kill_succeeded_pods() diff --git a/ansible/roles/k8s-control-panel/files/etc/systemd/system/orc2-fix-dind-bot.service b/ansible/roles/k8s-control-panel/files/etc/systemd/system/orc2-fix-dind-bot.service new file mode 100644 index 000000000..b3f183dd2 --- /dev/null +++ 
b/ansible/roles/k8s-control-panel/files/etc/systemd/system/orc2-fix-dind-bot.service @@ -0,0 +1,14 @@ +[Unit] +Description=Bot service to restart ORC2 Docker-in-Docker when is not working +After=kubelet.service +StartLimitIntervalSec=0 + +[Service] +Type=simple +Restart=always +RestartSec=1 +User=ansible +{% for host in hostvars %} +Environment="PASSWORD_{{ hostvars[host]['ansible_host'] | replace(".", "_") }}={{ hostvars[host]['ansible_become_pass'] }}" +{% endfor %} +ExecStart=/usr/bin/python3 /usr/bin/orc2-fix-dind-bot.py --verbose diff --git a/ansible/roles/k8s-control-panel/files/etc/systemd/system/orc2-fix-jupyterhub-bot.service b/ansible/roles/k8s-control-panel/files/etc/systemd/system/orc2-fix-jupyterhub-bot.service new file mode 100644 index 000000000..19c08eb99 --- /dev/null +++ b/ansible/roles/k8s-control-panel/files/etc/systemd/system/orc2-fix-jupyterhub-bot.service @@ -0,0 +1,11 @@ +[Unit] +Description=Bot service to restart ORC2 JupyterHub when API is not working +After=kubelet.service +StartLimitIntervalSec=0 + +[Service] +Type=simple +Restart=always +RestartSec=1 +User=ansible +ExecStart=/usr/bin/python3 /usr/bin/orc2-fix-jupyterhub-bot.py --verbose diff --git a/ansible/roles/k8s-control-panel/files/usr/bin/orc2-fix-dind-bot.py b/ansible/roles/k8s-control-panel/files/usr/bin/orc2-fix-dind-bot.py new file mode 100644 index 000000000..daf580ae7 --- /dev/null +++ b/ansible/roles/k8s-control-panel/files/usr/bin/orc2-fix-dind-bot.py @@ -0,0 +1,146 @@ +"""Script to identify when Docker-in-Docker stop working.""" + +import argparse +import datetime +import logging +import os + +from kubernetes import client, config, watch + +from invoke import Responder +from fabric import Connection + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S" +) +logger = logging.getLogger("orc2-fix-dind-bot") +logger.setLevel(logging.WARNING) + +NAMESPACE = "gesis" + + +def remove_docker_socket(host_IP): + """Remove Docker 
socket""" + ssh_password = os.getenv(f"PASSWORD_{host_IP.replace('.', '_')}") + + logger.info("Connecting to %s ...", host_IP) + c = Connection(host_IP, user="ansible", connect_kwargs={"password": ssh_password}) + logger.info("Connected to %s!", host_IP) + + logger.info("Removing Docker socket ...") + sudopass = Responder( + pattern=r"\[sudo\] password for .*:", + response=f"{ssh_password}\n", + ) + c.run("sudo rm -rf /var/run/dind/docker.sock/", pty=True, watchers=[sudopass]) + logger.info("Removed Docker socket.") + + +def remove_pods(): + """Remove Docker-in-Docker related pods""" + logger.debug("Starting search for pods ...") + api_response = v1.list_namespaced_pod(NAMESPACE) + for pod in api_response.items: + logger.debug("Pod %s is running on the cluster", pod.metadata.name) + if pod.metadata.name.startswith( + "binderhub-dind-" + ) or pod.metadata.name.startswith("binderhub-image-cleaner-"): + logger.info("Found pod %s", pod.metadata.name) + pod_to_delete_name = pod.metadata.name + logger.info("Requesting delete of pod %s ...", pod_to_delete_name) + try: + api_response = v1.delete_namespaced_pod(pod_to_delete_name, NAMESPACE) + logger.info("Pod %s deleted.", pod_to_delete_name) + except client.exceptions.ApiException as exception: + logger.info( + "Fail to delete pod %s due %s", pod_to_delete_name, exception + ) + logger.debug("Completed search for pods!") + + +def get_node_running_pod(pod_name): + """Get node host's IP address running pod""" + pod_status = v1.read_namespaced_pod(pod_name, namespace=NAMESPACE) + logger.debug(pod_status) + host_IP = pod_status.status.host_ip + logger.info("%s is running on %s", pod_name, host_IP) + return host_IP + + +def monitor_cluster(): + """Monitor pod""" + while True: + logger.info("Start monitoring ...") + + w = watch.Watch() + for event in w.stream(v1.list_namespaced_event, namespace=NAMESPACE): + pod_name = event["object"].involved_object.name + if pod_name.startswith("binderhub-dind-"): + if event["object"].type == 
"Warning": + logger.info("Found Warning event in %s", pod_name) + if event["object"].reason == "BackOff": + time_since_last_timestamp = ( + datetime.datetime.now(datetime.timezone.utc) + - event["object"].last_timestamp + ) + + if time_since_last_timestamp.seconds > 5: + logger.info( + "Skipping because event old (%d > 5 seconds).", + time_since_last_timestamp.seconds, + ) + else: + logger.info("Removing Docker-in-Docker socket and pods ...") + try: + node_IP_address = get_node_running_pod(pod_name) + remove_docker_socket(node_IP_address) + remove_pods() + except Exception as exception: + logger.info( + "Fail to delete pod %s due %s", pod_name, exception + ) + + elif event["object"].type == "Normal": + logger.debug( + "Found Normal event in %s ... skipping!", + event["object"].metadata.name, + ) + else: + logger.debug( + "Found %s event in %s ... ignoring!", + event["object"].type, + event["object"].metadata.name, + ) + + logger.info("Stop monitoring!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Open Research Computing v2 Fix Docker-in-Docker Bot", + description="Monitoring Kubernetes cluster to restart Docker-in-Docker", + ) + parser.add_argument( + "-c", + "--kube-config", + type=str, + default="~/.kube/config", + help="Location of Kubernetes configuration file", + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="Display log information" + ) + parser.add_argument( + "-vv", "--debug", action="store_true", help="Display debug information" + ) + args = parser.parse_args() + if args.verbose: + logger.setLevel(logging.INFO) + if args.debug: + logger.setLevel(logging.DEBUG) + + config.load_kube_config(config_file=args.kube_config) + + v1 = client.CoreV1Api() + + monitor_cluster() diff --git a/ansible/roles/k8s-control-panel/files/usr/bin/orc2-fix-jupyterhub-bot.py b/ansible/roles/k8s-control-panel/files/usr/bin/orc2-fix-jupyterhub-bot.py new file mode 100644 index 000000000..97c9e22fd --- /dev/null +++ 
b/ansible/roles/k8s-control-panel/files/usr/bin/orc2-fix-jupyterhub-bot.py @@ -0,0 +1,111 @@ +"""Script to identify when JupyterHub stop working.""" + +import argparse +import datetime +import logging + +from kubernetes import client, config, watch + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S" +) +logger = logging.getLogger("orc2-fix-jupyterhub-bot") +logger.setLevel(logging.WARNING) + +NAMESPACE = "gesis" +RESTART_WAITING_TIME = 120 # seconds + + +def get_binder_pod(): + """Get name of pod running Binder.""" + logger.debug("Starting search for BinderHub pod ...") + api_response = v1.list_namespaced_pod(NAMESPACE) + for pod in api_response.items: + logger.debug("Pod %s is running on the cluster", pod.metadata.name) + if pod.metadata.name.startswith("binder-"): + logger.info("Found BinderHub pod: %s", pod.metadata.name) + binder_pod_name = pod.metadata.name + break + + logger.debug("Search for BinderHub pod stop.") + return binder_pod_name + + +def kill_jupyterhub_pod(): + """Kill all JupyterHub pods""" + logger.debug("Starting search for JupyterHub pod ...") + api_response = v1.list_namespaced_pod(NAMESPACE) + for pod in api_response.items: + logger.debug("Pod %s is running on the cluster", pod.metadata.name) + if pod.metadata.name.startswith("hub-"): + logger.info("Found JupyterHub pod: %s", pod.metadata.name) + logger.info("Requesting delete of pod %s ...", pod.metadata.name) + try: + api_response = v1.delete_namespaced_pod(pod.metadata.name, NAMESPACE) + logger.info("Pod %s deleted.", api_response.metadata.name) + except client.exceptions.ApiException as exception: + logger.info( + "Fail to delete pod %s due %s", pod.metadata.name, exception + ) + logger.debug("Search for JupyterHub pod stop.") + + +def monitor_pod(): + """Monitor pod""" + while True: + pod_name = get_binder_pod() + logger.info("Monitoring %s", pod_name) + + last_jupyterhub_restart = datetime.datetime.now(datetime.timezone.utc) + + w = 
watch.Watch() + for line in w.stream( + v1.read_namespaced_pod_log, name=pod_name, namespace=NAMESPACE, tail_lines=0 + ): + if line.find("Error accessing Hub API") > -1: + logger.debug(line) + + now = datetime.datetime.now(datetime.timezone.utc) + time_difference = now - last_jupyterhub_restart + if time_difference.seconds > RESTART_WAITING_TIME: + logger.info("Restarting JupyterHub ...") + kill_jupyterhub_pod() + last_jupyterhub_restart = now + else: + logger.info( + "Waiting %s seconds for JupyterHub to restart.", + RESTART_WAITING_TIME, + ) + + logger.info("Stop monitoring %s", pod_name) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Open Research Computing v2 Fix JupyterHub Bot", + description="Monitoring Kubernetes cluster to restart JupyterHub", + ) + parser.add_argument( + "-c", + "--kube-config", + type=str, + default="~/.kube/config", + help="Location of Kubernetes configuration file", + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="Display log information" + ) + parser.add_argument( + "-vv", "--debug", action="store_true", help="Display debug information" + ) + args = parser.parse_args() + if args.verbose: + logger.setLevel(logging.INFO) + if args.debug: + logger.setLevel(logging.DEBUG) + + config.load_kube_config(config_file=args.kube_config) + + v1 = client.CoreV1Api() + + monitor_pod() diff --git a/ansible/roles/k8s-control-panel/tasks/main.yml b/ansible/roles/k8s-control-panel/tasks/main.yml new file mode 100644 index 000000000..c4f30f0c4 --- /dev/null +++ b/ansible/roles/k8s-control-panel/tasks/main.yml @@ -0,0 +1,179 @@ +- name: Check if Kubernetes is running + ansible.builtin.shell: > + kubectl get nodes || /bin/true + changed_when: false + register: kubernetes_nodes +- name: Pull kubernetes images + when: kubernetes_nodes.stdout.find('control-plane') == -1 + ansible.builtin.shell: > + kubeadm config images pull + --cri-socket unix:///run/containerd/containerd.sock + changed_when: false +- 
name: Initialize the cluster + when: kubernetes_nodes.stdout.find('control-plane') == -1 + ansible.builtin.shell: > + kubeadm init + --pod-network-cidr=10.244.0.0/16 + --upload-certs + --control-plane-endpoint={{ K8S_CONTROL_PLANE_ENDPOINT }} + --cri-socket unix:///run/containerd/containerd.sock + changed_when: false + register: kubeadm_init_output +- name: Create root's .kube directory + ansible.builtin.file: + path: /root/.kube + state: directory + owner: root + group: root + mode: u=rwx,g=rx,o=rx +- name: Copies admin.conf to root's kube config + ansible.builtin.copy: + src: /etc/kubernetes/admin.conf + dest: /root/.kube/config + remote_src: true + owner: root + group: root + mode: u=rw,g=r,o= +- name: Create user's .kube directory + ansible.builtin.file: + path: /home/ansible/.kube + state: directory + mode: u=rwx,g=rx,o=rx + owner: ansible + group: ansible +- name: Copies admin.conf to user's kube config + ansible.builtin.copy: + src: /etc/kubernetes/admin.conf + dest: /home/ansible/.kube/config + remote_src: true + owner: ansible + group: ansible + mode: u=rw,g=r,o= +- name: Get the token for joining the worker nodes + ansible.builtin.shell: > + kubeadm token create --print-join-command + changed_when: false + register: kubernetes_join_command +- name: Create temporary file + ansible.builtin.file: + path: /tmp/kubernetes_join_command + state: touch + owner: ansible + group: ansible + mode: u=rw,g=r,o= +- name: Save content of join command + ansible.builtin.copy: + content: | + #!/bin/sh + {{ kubernetes_join_command.stdout }} + dest: /tmp/kubernetes_join_command + owner: ansible + group: ansible + mode: u=rw,g=r,o= +- name: Copy join command to local file + ansible.builtin.fetch: + src: /tmp/kubernetes_join_command + dest: '{{ ANSIBLE_CONTROL_NODE_TMP }}' +- name: Remove Container Network Interface (CNI) Flannel + kubernetes.core.k8s: + state: absent + src: https://github.com/coreos/flannel/raw/master/Documentation/kube-flannel.yml +- name: Install Container 
Network Interface (CNI) Tigera Calico operator + kubernetes.core.k8s: + state: present + src: https://raw.githubusercontent.com/projectcalico/calico/v3.26.3/manifests/tigera-operator.yaml +- name: Install Calico and resource + kubernetes.core.k8s: + state: present + definition: "{{ lookup('file', '{{ role_path }}/files/calico/custom-resources.yaml') | from_yaml_all }}" +- name: Install Cert Manager + kubernetes.core.k8s: + state: present + src: https://github.com/cert-manager/cert-manager/releases/download/v1.15.3/cert-manager.crds.yaml +- name: Add GitLab Helm repository + kubernetes.core.helm_repository: + name: gitlab + repo_url: https://charts.gitlab.io +- name: Deploy GitLab agent + kubernetes.core.helm: + name: gitlab-agent + chart_ref: gitlab/gitlab-agent + release_namespace: gitlab-agent + dependency_update: true + create_namespace: true + set_values: + - value: 'config.token={{ GITLAB_K8S_TOKEN }}' + - value: config.kasAddress=wss://git.gesis.org/-/kubernetes-agent/ +- name: Copy orc2-fix-jupyterhub-bot Python script + ansible.builtin.copy: + src: files/usr/bin/orc2-fix-jupyterhub-bot.py + dest: /usr/bin/orc2-fix-jupyterhub-bot.py + owner: root + group: root + mode: u=rwx,g=rwx,o=r +- name: Copy orc2-fix-jupyterhub-bot Systemd Unit script + ansible.builtin.copy: + src: files/etc/systemd/system/orc2-fix-jupyterhub-bot.service + dest: /etc/systemd/system/orc2-fix-jupyterhub-bot.service + owner: root + group: root + mode: u=rwx,g=rwx,o=r +- name: Enable service orc2-fix-jupyterhub-bot + ansible.builtin.systemd: + name: orc2-fix-jupyterhub-bot + daemon_reload: true + enabled: true + masked: false + state: restarted +- name: Copy orc2-fix-dind-bot Python script + ansible.builtin.copy: + src: files/usr/bin/orc2-fix-dind-bot.py + dest: /usr/bin/orc2-fix-dind-bot.py + owner: root + group: root + mode: u=rwx,g=rwx,o=rx +- name: Copy orc2-fix-dind-bot Systemd Unit script + ansible.builtin.template: + src: files/etc/systemd/system/orc2-fix-dind-bot.service + dest: 
/etc/systemd/system/orc2-fix-dind-bot.service + owner: root + group: root + mode: u=rwx,g=rwx,o=r +- name: Enable service orc2-fix-dind-bot + ansible.builtin.systemd: + name: orc2-fix-dind-bot + daemon_reload: true + enabled: true + masked: false + state: restarted +- name: Create directory + ansible.builtin.file: + state: directory + path: /home/ansible/bin + owner: ansible + group: ansible + mode: u=rwx,g=rwx,o=rx +- name: Copy kill-succeeded-pods.py + ansible.builtin.copy: + src: files/cron/kill-succeeded-pods.py + dest: /home/ansible/bin/kill-succeeded-pods.py + owner: ansible + group: ansible + mode: u=rwx,g=rwx,o=r +- name: Add cron job to remove succeeded pods + ansible.builtin.cron: + name: "remove succeeded" + job: "python3 /home/ansible/bin/kill-succeeded-pods.py --verbose >> /home/ansible/kill-succeeded-pods.log 2>&1" + minute: "*/5" +- name: Copy kill-after-timeout-pods.py + ansible.builtin.copy: + src: files/cron/kill-after-timeout-pods.py + dest: /home/ansible/bin/kill-after-timeout-pods.py + owner: ansible + group: ansible + mode: u=rwx,g=rwx,o=r +- name: Add cron job to remove timed out pods + ansible.builtin.cron: + name: "remove timeout" + job: "python3 /home/ansible/bin/kill-after-timeout-pods.py --verbose >> /home/ansible/kill-after-timeout-pods.log 2>&1" + minute: "*/5" diff --git a/ansible/roles/k8s-pv/tasks/grafana.yml b/ansible/roles/k8s-pv/tasks/grafana.yml new file mode 100644 index 000000000..e983576ad --- /dev/null +++ b/ansible/roles/k8s-pv/tasks/grafana.yml @@ -0,0 +1,29 @@ +- name: Create a Persistent Volume for Grafana + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: PersistentVolume + metadata: + name: grafana + labels: + app.kubernetes.io/managed-by: Ansible + app.kubernetes.io/part-of: grafana + spec: + capacity: + storage: "{{ GRAFANA_CAPACITY_STORAGE }}" + volumeMode: Filesystem + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: "local-storage" + local: + 
path: /orc2_data/grafana + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: grafana + operator: In + values: + - "true" diff --git a/ansible/roles/k8s-pv/tasks/jupyter-hub-db.yml b/ansible/roles/k8s-pv/tasks/jupyter-hub-db.yml new file mode 100644 index 000000000..55bcab192 --- /dev/null +++ b/ansible/roles/k8s-pv/tasks/jupyter-hub-db.yml @@ -0,0 +1,29 @@ +- name: Create a Persistent Volume for JupyterHub + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: PersistentVolume + metadata: + name: "jupyterhub-db" + labels: + app.kubernetes.io/managed-by: Ansible + app.kubernetes.io/part-of: jupyterhub + spec: + capacity: + storage: "{{ JUPYTERHUB_CAPACITY_STORAGE }}" + volumeMode: Filesystem + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: "local-storage" + local: + path: /orc2_data/jupyterhub + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: jupyterhub + operator: In + values: + - "true" diff --git a/ansible/roles/k8s-pv/tasks/main.yml b/ansible/roles/k8s-pv/tasks/main.yml new file mode 100644 index 000000000..e83a385b4 --- /dev/null +++ b/ansible/roles/k8s-pv/tasks/main.yml @@ -0,0 +1,22 @@ +- name: Create a Persistent Volume for Prometheus + kubernetes.core.k8s: + state: present + definition: + apiVersion: storage.k8s.io/v1 + kind: StorageClass + metadata: + name: local-storage + labels: + app.kubernetes.io/managed-by: Ansible + app.kubernetes.io/part-of: mybinder + provisioner: kubernetes.io/no-provisioner + volumeBindingMode: WaitForFirstConsumer +- name: Provide Persistent Volume for Grafana + ansible.builtin.import_tasks: + file: grafana.yml +- name: Provide Persistent Volume for JupyterHub + ansible.builtin.import_tasks: + file: jupyter-hub-db.yml +- name: Provide Persistent Volume for Prometheus + ansible.builtin.import_tasks: + file: prometheus.yml diff --git a/ansible/roles/k8s-pv/tasks/prometheus.yml 
b/ansible/roles/k8s-pv/tasks/prometheus.yml new file mode 100644 index 000000000..ca4aa8a8a --- /dev/null +++ b/ansible/roles/k8s-pv/tasks/prometheus.yml @@ -0,0 +1,29 @@ +- name: Create a Persistent Volume for Prometheus + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: PersistentVolume + metadata: + name: prometheus + labels: + app.kubernetes.io/managed-by: Ansible + app.kubernetes.io/part-of: prometheus + spec: + capacity: + storage: "{{ PROMETHEUS_CAPACITY_STORAGE }}" + volumeMode: Filesystem + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: "local-storage" + local: + path: /orc2_data/prometheus + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: prometheus + operator: In + values: + - "true" diff --git a/ansible/roles/k8s-worker/tasks/main.yml b/ansible/roles/k8s-worker/tasks/main.yml new file mode 100644 index 000000000..7f1d6b455 --- /dev/null +++ b/ansible/roles/k8s-worker/tasks/main.yml @@ -0,0 +1,12 @@ +- name: Copy join command + ansible.builtin.copy: + src: '{{ ANSIBLE_CONTROL_NODE_TMP }}/{{ K8S_CONTROL_PLANE_ALIAS }}/tmp/kubernetes_join_command' + dest: /tmp/kubernetes_join_command + mode: u=rwx,g=rx,o=rx +- name: Attempt to join cluster + ansible.builtin.command: /tmp/kubernetes_join_command + register: kubernetes_join_attempt + failed_when: + - kubernetes_join_attempt.rc != 0 + - '"already exists" not in kubernetes_join_attempt.stderr' + changed_when: false diff --git a/ansible/roles/mybinder/tasks/main.yml b/ansible/roles/mybinder/tasks/main.yml new file mode 100644 index 000000000..720cb1e31 --- /dev/null +++ b/ansible/roles/mybinder/tasks/main.yml @@ -0,0 +1,22 @@ +- name: Add hub.jupyter.org/node-purpose label + ansible.builtin.shell: | + {% for host in hostvars %} + {% if host in groups['binderhub'] %} + kubectl label nodes {{ host }} hub.jupyter.org/node-purpose=core + {% else %} + kubectl label nodes {{ host }} hub.jupyter.org/node-purpose- + {% 
endif %} + {% endfor %} + changed_when: false +- name: Add labels from inventory + ansible.builtin.shell: | + {% for host in hostvars %} + {% for group, host_list in groups.items() %} + {% if host in host_list %} + kubectl label nodes {{ host }} {{ group }}=true + {% else %} + kubectl label nodes {{ host }} {{ group }}- + {% endif %} + {% endfor %} + {% endfor %} + changed_when: false diff --git a/ansible/vault/gesis-production.yml b/ansible/vault/gesis-production.yml new file mode 100644 index 000000000..0576d46f5 --- /dev/null +++ b/ansible/vault/gesis-production.yml @@ -0,0 +1,18 @@ +$ANSIBLE_VAULT;1.1;AES256 +32626233376562376639323233666538613863613765326261366535656434663931306235623132 +3561333630333337376461663662663165396630303962310a386331373832366237653436643836 +38666333643435393864666135303731663732343030336561656631663861303338613461343561 +3132653334336139610a336364343431376537316532626332646438656334646331663330646632 +33313237303330346462616562313564623732653435313365333166376162313061656131626536 +31356434663062626633616234393165323632376231656161303563633436396230363533643130 +62653435623037633461623134393132383833306563313938323338633232633363376466393064 +30636134346636616533333935663565336134303063646332633863626230616662643431656539 +30353664633961633263333435336232663538393431316662353336666365373066323066633131 +63613562663466343865306532333565363362386235643962343234613562303164303638623365 +63386635316531356238326364376334663934316661336537663561623664306133356134363661 +32353133333736613063363130303761363966653562613631623436333236366334303030303938 +63613238656662343037373932333933396538376565646434316530616461303032326263646161 +35353337343065343465666538346531633164623932393935316666326337303133613134373835 +33323561663337313230656136376561373665306161353338373333333134313464343266373365 +64333130393738656331666165383963613139613766363732306230393764623866653330373764 +6537 diff --git a/ansible/vault/gesis-stage.yml 
b/ansible/vault/gesis-stage.yml new file mode 100644 index 000000000..c6ce4e92c --- /dev/null +++ b/ansible/vault/gesis-stage.yml @@ -0,0 +1,22 @@ +$ANSIBLE_VAULT;1.1;AES256 +65666231316164316637653330376337383937373938613334343066376139326661643962376237 +3739366536353237356539656138383164326139333139390a333134313565323232646639313162 +61656433306461343266393566626465316239353933303136633034343231666337363838623563 +6633633234626132390a333632353730353066326438623663383634343532333539366363333334 +34646163313065393732306363353231633239313637646339623032626366626436346234376130 +66636432383138383838616434303931316334386665303563376336623930356638666366333561 +66353830353361343335623737653130383862353638393336303866323738303865623934303830 +66663164353837626636653766646233666164393564396233656665646636643862643035383733 +65376535346438623032316666333265643135653035373139626232646430623733383134656533 +34323737613565663536643430613832636666653030383066316632336363323734326339376162 +39343665393661623530303236353165656130396137373634363265346362623832653563613338 +31313261646333656362636134306162666133373334653933366531643063643537353663353932 +39386538626664393536363035646265643832303961323636653037356433346266353963666164 +32653334653936633130316463303061343938363630376663613639636338343331353732363837 +37616137373834333836393137333131643432653239313432623462616537353337303432393736 +34333463636566373330346437653037313366633762623161616564376639376561333561366530 +37356235373336303563373137393263626532356333666166396435346565333964316263393665 +32636239396563326635363636396435623731613364376632336261643064336530616235386631 +37336230323331323838326331303831616337363833616563306131393733666663303836636366 +38656336373763353836643536376239316463353862323332626661346366636236613530366464 +36363832656263633161303335613332396237353865643964626462653565386562 diff --git a/config/gesis-stage.yaml b/config/gesis-stage.yaml new file mode 100644 index 000000000..2270bb59e --- 
/dev/null +++ b/config/gesis-stage.yaml @@ -0,0 +1,167 @@ +analyticsPublisher: + enabled: false +binderhub: + config: + BinderHub: + base_url: /binder/ + build_node_selector: + binderhub: "true" + hub_url: https://notebooks-test.gesis.org/binder/jupyter/ + image_prefix: gesiscss/binder-r2d-g5b5b759- + template_path: /etc/binderhub/templates + use_registry: true + KubernetesBuildExecutor: + memory_limit: 3G + memory_request: 1G + node_selector: + binderhub: "true" + docker_available: "true" + LaunchQuota: + total_quota: 30 + extraConfig: + 01-template-variables: > + template_vars = { + "gesis_notebooks_https": 'https://notebooks-test.gesis.org/', + 'production': False, + } + + template_vars['gesis_notebooks_static'] = + template_vars['gesis_notebooks_https'] + "static/" + + template_vars['gesis_web_frontend_framework'] = + template_vars['gesis_notebooks_static'] + "gesis-web-frontend-framework/" + + template_vars['binder_static'] = template_vars['gesis_notebooks_https'] + + "binder/static/" + + c.BinderHub.template_variables.update(template_vars) + 02-badge-base-url: | + c.BinderHub.badge_base_url = "https://mybinder.org/" + extraEnv: + GOOGLE_APPLICATION_CREDENTIALS: /secrets/service-account.json + extraVolumeMounts: + - mountPath: /secrets + name: secrets + readOnly: true + extraVolumes: + - name: secrets + secret: + secretName: events-archiver-secrets + imageCleaner: + enabled: true + imageGCThresholdHigh: 80000000000 + imageGCThresholdLow: 50000000000 + imageGCThresholdType: absolute + ingress: + hosts: + - notebooks-test.gesis.org + jupyterhub: + hub: + baseUrl: /jupyterhub + db: + pvc: + storageClassName: local-storage + nodeSelector: + jupyterhub: "true" + singleuser: + nodeSelector: + jupyterhub_single_user: "true" + ingress: + hosts: + - notebooks-test.gesis.org + replicas: 1 +cryptnono: + enabled: true +grafana: + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - disableDeletion: true + editable: false + folder: 
notebooks.gesis.org + name: default + options: + path: /var/lib/grafana/dashboards/notebooks.gesis.org + orgId: 1 + type: file + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - editable: false + isDefault: true + name: GESIS Notebooks Prometheus + orgId: 1 + type: prometheus + uid: gesis-notebooks-prometheus + url: http://binderhub-prometheus-server + prune: true + deploymentStrategy: + type: Recreate + enabled: true + grafana.ini: + auth.anonymous: + enabled: true + org_name: Main Org. + org_role: Viewer + auth.basic: + enabled: true + security: + allow_embedding: true + server: + http_port: 3000 + root_url: https://notebooks.gesis.org/grafana/ + smtp: + enabled: true + ingress: + hosts: + - notebooks-test.gesis.org + path: /grafana + nodeSelector: + grafana: "true" + persistence: + enabled: false + resources: + limits: + cpu: "0.25" + memory: 128Mi + requests: + cpu: "0" + memory: 128Mi +ingress-nginx: + controller: + replicaCount: 1 + nodeSelector: + ingress: "true" + hostPort: + enable: true + scope: + enabled: true + service: + externalTrafficPolicy: null + type: ClusterIP +prometheus: + enabled: true + server: + ingress: + hosts: + - notebooks-test.gesis.org + path: /prometheus + livenessProbeInitialDelay: 800 + persistentVolume: + size: 10Gi + storageClass: local-storage + resources: + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: "1" + memory: 1Gi + retention: 30d +static: + ingress: + hosts: + - static.notebooks-test.gesis.org +url: https://notebooks-test.gesis.org/binder/ diff --git a/mybinder/templates/minesweeper/configmap.yaml b/mybinder/templates/minesweeper/configmap.yaml index a083cb0b7..41fd2a95f 100644 --- a/mybinder/templates/minesweeper/configmap.yaml +++ b/mybinder/templates/minesweeper/configmap.yaml @@ -1,4 +1,6 @@ -{{- /* configmap for minesweeper source files */ -}} +{{- /* +configmap for minesweeper source files +*/}} kind: ConfigMap apiVersion: v1 metadata: @@ -12,7 +14,9 @@ data: {{- (.Files.Glob 
"files/minesweeper/*").AsConfig | nindent 2 }} {{- (.Files.Glob "files/minesweeper/secrets/*").AsConfig | nindent 2 }} --- -{{- /* configmap for minesweeper configuration from values */ -}} +{{- /* +configmap for minesweeper configuration from values +*/}} kind: ConfigMap apiVersion: v1 metadata: diff --git a/secrets/config/common/gesis.yaml b/secrets/config/common/gesis.yaml new file mode 100644 index 000000000..a62d2ecc6 Binary files /dev/null and b/secrets/config/common/gesis.yaml differ diff --git a/secrets/config/gesis-stage.yaml b/secrets/config/gesis-stage.yaml new file mode 100644 index 000000000..7e5795906 Binary files /dev/null and b/secrets/config/gesis-stage.yaml differ