Skip to content

Commit

Permalink
GH-524 staging environment (#532)
Browse files Browse the repository at this point in the history
Co-authored-by: Humberto Evans <hevans66@gmail.com>
  • Loading branch information
alabdao and hevans66 authored Jul 21, 2023
1 parent 4bb5e6e commit 9538de2
Show file tree
Hide file tree
Showing 26 changed files with 711 additions and 94 deletions.
12 changes: 11 additions & 1 deletion infrastructure/ansible/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,20 @@ Ansible configuration consists of playbooks you run using `ansible-playbook [pla

We are using an AWS plugin that provides dynamic EC2 inventory. Through configuration in `inventory.aws_ec2.yaml` we tell the dynamic inventory mechanism to group instances together based on AWS instance tags. Setting (for example) a Type tag on your AWS instances of the same type (in Terraform) will allow you to target the instances you wish to perform tasks on.

# Environment

Use tags to limit a run to a specific environment: pass `--limit tag_Env_staging` for staging and `--limit tag_Env_prod` for prod.

Environment-specific vars files are available under `vars/<env>.yaml`.

To pass in extra-vars do:

`ansible-playbook -e "@vars/staging.yaml" ............`

# Playbooks

* `provision_jupyter.yaml` Targets `jupyter_notebook` instances and installs [The Littlest Jupyter Hub](https://tljh.jupyter.org/). It does not do any configuration beyond installation.
* `set_jupyter_users.yaml` Sets the admins, users, and defines access permissions to a team folder
* `set_jupyter_users.yaml` Sets the admins, users, and defines access permissions to a team folder

# The teams.yaml

Expand Down
3 changes: 2 additions & 1 deletion infrastructure/ansible/files/requester.service
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ ExecStart=bacalhau serve \
{% endif %}
--labels owner={{ owner }} \
--job-selection-accept-networked \
--job-selection-data-locality anywhere
--job-selection-data-locality anywhere \
--peer none

[Install]
WantedBy=multi-user.target
34 changes: 34 additions & 0 deletions infrastructure/ansible/install_docker_tasks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
---
# Installs Docker from Docker's apt repository and adds the `ubuntu` user to
# the `docker` group. This file is a bare task list: it is meant to be pulled
# into a play (for example with `ansible.builtin.import_tasks`), not run on its
# own.

# Docker
- name: Add Docker GPG key
  become: true
  ansible.builtin.get_url:
    url: https://download.docker.com/linux/ubuntu/gpg
    dest: /etc/apt/trusted.gpg.d/docker.asc
    # Explicit mode so the key's readability does not depend on the remote
    # umask.
    mode: "0644"

# The repository entry references the key downloaded above via `signed-by`.
- name: Add Docker Repository
  become: true
  ansible.builtin.apt_repository:
    repo: deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/docker.asc] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable
    state: present

- name: Create the docker group
  become: true
  ansible.builtin.group:
    name: docker

- name: Add ubuntu user to docker group
  become: true
  ansible.builtin.user:
    name: ubuntu
    groups: docker
    # Without `append`, the user module replaces the user's supplementary
    # groups with exactly the list above, dropping every other group the
    # `ubuntu` user already belongs to.
    append: true

- name: Install docker
  become: true
  ansible.builtin.apt:
    pkg:
      - docker-ce
      - docker-ce-cli
      - containerd.io
      - docker-compose-plugin
    # Refresh the package index so the repository added above is visible.
    update_cache: true
2 changes: 2 additions & 0 deletions infrastructure/ansible/install_ipfs_tasks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
ipfs config Addresses.Gateway /ip4/0.0.0.0/tcp/8080
ipfs config --json API.HTTPHeaders.Access-Control-Allow-Methods '["PUT", "POST"]'
ipfs config Pinning.Recursive true
environment:
IPFS_PATH: "{{ ipfs_path }}"

- name: Install the IPFS systemd unit
become: yes
Expand Down
2 changes: 1 addition & 1 deletion infrastructure/ansible/jupyter_deploy_plex.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
ansible.builtin.file:
path: "{{ plex_dir }}"
owner: ubuntu
group: ubuntu
group: ubuntu
state: directory

- name: Ensure all files in plex dir are owned by the user
Expand Down
2 changes: 1 addition & 1 deletion infrastructure/ansible/jupyter_set_users.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
ansible.builtin.command:
cmd: tljh-config add-item users.admin {{ item }}
loop: "{{ admins | default([])}}"

- name: Create teams
include_tasks: jupyter_team_setup_tasks.yaml
loop: "{{ teams | default([])}}"
Expand Down
6 changes: 3 additions & 3 deletions infrastructure/ansible/jupyter_team_setup_tasks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
groups: "{{ item.team }}"
loop: "{{ item.users }}"
loop_control:
loop_var: user
loop_var: user

- name: Link shared folder into the users home directory
become: yes
Expand All @@ -49,10 +49,10 @@
owner: jupyter-{{ user }}
group: jupyter-{{ user }}
follow: false
state: link
state: link
loop: "{{ item.users }}"
loop_control:
loop_var: user
loop_var: user

- name: Ensure the symlink exists to the examples directory for every user
become: yes
Expand Down
2 changes: 1 addition & 1 deletion infrastructure/ansible/provision_canary.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
ansible.builtin.file:
path: "{{ item }}"
owner: ubuntu
group: ubuntu
group: ubuntu
state: directory
loop:
- "{{ repo_dir }}"
Expand Down
15 changes: 10 additions & 5 deletions infrastructure/ansible/provision_compute_instance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
environment:
IPFS_PATH: "{{ ipfs_path }}"
tasks:
# Must provide limit flag to ensure running against current environment
- fail:
msg: "you must use -l or --limit"
when: ansible_limit is not defined
run_once: true

# Aptitude is preferred by ansible
- name: Install aptitude
become: yes
Expand Down Expand Up @@ -50,13 +56,13 @@
become: yes
ansible.builtin.apt:
deb: /tmp/cuda-keyring.deb

- name: Get Nvidia Container Tookit GPG key
become: yes
ansible.builtin.shell:
cmd: curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --yes --dearmor -o {{ nvidia_container_toolkit_key_path }}
creates: "{{ nvidia_container_toolkit_key_path }}"

- name: Add Nvidia Container Tookit Repository
become: yes
ansible.builtin.apt_repository:
Expand Down Expand Up @@ -107,8 +113,8 @@
- name: Install IPFS
ansible.builtin.get_url:
url: https://dist.ipfs.tech/kubo/v0.18.0/kubo_v0.18.0_linux-amd64.tar.gz
dest: /tmp/ipfs.tar.gz
dest: /tmp/ipfs.tar.gz

- name: Make a folder to put IPFS files in
ansible.builtin.file:
path: /tmp/ipfs
Expand Down Expand Up @@ -182,7 +188,6 @@
vars:
owner: labdao
ipfs_connect: /ip4/127.0.0.1/tcp/5001
receptor_url: http://ip-172-31-82-127.ec2.internal:8080/judge
notify:
- Restart Bacalhau

Expand Down
119 changes: 41 additions & 78 deletions infrastructure/ansible/provision_compute_only.yaml
Original file line number Diff line number Diff line change
@@ -1,109 +1,72 @@
- name: Provision Bacalhau Compute Instance
remote_user: ubuntu
hosts: tag_Type_compute_only:&tag_Env_prod
hosts: tag_Type_compute_only
vars:
nvidia_distribution: ubuntu2004
nvidia_container_toolkit_key_path: /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
ipfs_path: /opt/local/ipfs
requester_peer: /ip4/172.31.90.74/tcp/1235/p2p/QmbETsVtL1sQ97KKV1jPQA5ng8RSyzPWUiDgRBQp7AcjRt
requester_ipfs_peer: /ip4/172.31.90.74/tcp/4001/p2p/12D3KooWAjYbsjXAQWqPRCPTTaMkDkUjhschznkLrDoKUyfQvHAP
gpu: true
environment:
IPFS_PATH: "{{ ipfs_path }}"
tasks:
# Must provide limit flag to ensure running against current environment
- fail:
msg: "you must use -l or --limit"
when: ansible_limit is not defined
run_once: true

# Aptitude is preferred by ansible
- name: Install aptitude
- name: Install aptitude and other required system packages
become: yes
ansible.builtin.apt:
name: aptitude
state: latest
update_cache: true
pkg:
- aptitude
- curl
- ca-certificates
- gnupg
- lsb-release

# Docker
- name: Add Docker GPG key
become: yes
ansible.builtin.get_url:
url: https://download.docker.com/linux/ubuntu/gpg
dest: /etc/apt/trusted.gpg.d/docker.asc
- name: Install Docker
ansible.builtin.import_tasks: install_docker_tasks.yaml

- name: Add Docker Repository
# Nvidia
- name: Add Nvidia Keyring
become: yes
ansible.builtin.apt_repository:
repo: deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/docker.asc] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable
state: present
ansible.builtin.apt:
deb: https://developer.download.nvidia.com/compute/cuda/repos/{{ nvidia_distribution }}/x86_64/cuda-keyring_1.1-1_all.deb

- name: Create the docker group
- name: Get Nvidia Container Tookit GPG key
become: yes
ansible.builtin.group:
name: docker
ansible.builtin.apt_key:
url: https://nvidia.github.io/libnvidia-container/gpgkey

- name: Add ubuntu user to docker group
- name: Add Nvidia Container Tookit Repository
become: yes
ansible.builtin.user:
name: ubuntu
groups: docker
ansible.builtin.apt_repository:
repo: deb https://nvidia.github.io/libnvidia-container/stable/ubuntu18.04/$(ARCH) /
state: present

- name: Install required system packages
- name: Install required system packages for gpu build
become: yes
ansible.builtin.apt:
pkg:
- ca-certificates
- curl
- gnupg
- lsb-release
- docker-ce
- docker-ce-cli
- containerd.io
- docker-compose-plugin
- cuda-drivers
state: latest
update_cache: true

# Nvidia
- name: Install Nvidia GPU drivers and packages
block:
- name: Get Nvidia drivers apt key
ansible.builtin.get_url:
url: https://developer.download.nvidia.com/compute/cuda/repos/{{ nvidia_distribution }}/x86_64/cuda-keyring_1.0-1_all.deb
dest: /tmp/cuda-keyring.deb

- name: Add Nvidia Keyring
become: yes
ansible.builtin.apt:
deb: /tmp/cuda-keyring.deb

- name: Get Nvidia Container Tookit GPG key
become: yes
ansible.builtin.shell:
cmd: curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --yes --dearmor -o {{ nvidia_container_toolkit_key_path }}
creates: "{{ nvidia_container_toolkit_key_path }}"

- name: Add Nvidia Container Tookit Repository
become: yes
ansible.builtin.apt_repository:
repo: deb [signed-by={{ nvidia_container_toolkit_key_path }}] https://nvidia.github.io/libnvidia-container/stable/ubuntu18.04/$(ARCH) /
state: present

- name: Install required system packages for gpu build
become: yes
ansible.builtin.apt:
pkg:
- cuda-drivers
state: latest
update_cache: true

- name: Install Nvidia Container Tookit
become: yes
ansible.builtin.apt:
pkg:
- nvidia-docker2
notify:
- Restart docker

- name: Ensure Nvidia persitence daemon is started
ansible.builtin.systemd:
name: nvidia-persistenced
when: gpu

- name: Install Nvidia Container Tookit
become: yes
ansible.builtin.apt:
pkg:
- nvidia-docker2
notify:
- Restart docker

- name: Ensure Nvidia persitence daemon is started
ansible.builtin.systemd:
name: nvidia-persistenced

- name: Install Golag
become: yes
vars:
Expand Down
8 changes: 7 additions & 1 deletion infrastructure/ansible/provision_jupyter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
letsencrypt_email: "josh@labdao.xyz"
letsencrypt_domain: "jupyter.labdao.xyz"
tasks:
# Must provide limit flag to ensure running against current environment
- fail:
msg: "you must use -l or --limit"
when: ansible_limit is not defined
run_once: true

# Aptitude is preferred by ansible
- name: Install aptitude
become: yes
Expand Down Expand Up @@ -89,5 +95,5 @@
ansible.builtin.command: tljh-config reload

- name: Bump system resources
become: yes
become: yes
ansible.builtin.command: sysctl -w net.core.rmem_max=2500000
8 changes: 7 additions & 1 deletion infrastructure/ansible/provision_receptor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
plex_dir: /opt/local/plex
receptor_dir: /opt/local/receptor
tasks:
# Must provide limit flag to ensure running against current environment
- fail:
msg: "you must use -l or --limit"
when: ansible_limit is not defined
run_once: true

- name: Install aptitude
become: yes
ansible.builtin.apt:
Expand All @@ -17,7 +23,7 @@
ansible.builtin.file:
path: "{{ item }}"
owner: ubuntu
group: ubuntu
group: ubuntu
state: directory
loop:
- "{{ plex_dir }}"
Expand Down
7 changes: 6 additions & 1 deletion infrastructure/ansible/provision_requester.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
vars:
ipfs_path: /opt/local/ipfs
tasks:
# Must provide limit flag to ensure running against current environment
- fail:
msg: "you must use -l or --limit"
when: ansible_limit is not defined
run_once: true

# Aptitude is preferred by ansible
- name: Install aptitude
become: yes
Expand Down Expand Up @@ -31,7 +37,6 @@
vars:
owner: labdao
ipfs_connect: /ip4/127.0.0.1/tcp/5001
receptor_url: http://ip-172-31-82-127.ec2.internal:8080/judge
notify:
- Restart Bacalhau

Expand Down
4 changes: 4 additions & 0 deletions infrastructure/ansible/vars/prod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
# Variables for the prod environment.
# NOTE(review): presumably passed as extra vars, e.g.
# `ansible-playbook -e "@vars/prod.yaml" ...` - confirm against the README.

# URL handed to the Bacalhau service templates as `receptor_url`.
# NOTE(review): consumer not visible here - verify.
receptor_url: 'http://ip-172-31-82-127.ec2.internal:8080/judge'

# Peer addresses of the prod requester node.
# NOTE(review): presumably the Bacalhau and IPFS swarm addresses - confirm.
requester_peer: '/ip4/172.31.90.74/tcp/1235/p2p/QmbETsVtL1sQ97KKV1jPQA5ng8RSyzPWUiDgRBQp7AcjRt'
requester_ipfs_peer: '/ip4/172.31.90.74/tcp/4001/p2p/12D3KooWAjYbsjXAQWqPRCPTTaMkDkUjhschznkLrDoKUyfQvHAP'
Loading

0 comments on commit 9538de2

Please sign in to comment.