Skip to content

Commit

Permalink
Ability for nodes to be able to self bootstrap (#553)
Browse files Browse the repository at this point in the history
  • Loading branch information
alabdao authored Jul 31, 2023
1 parent 71e9608 commit 6e9f37f
Show file tree
Hide file tree
Showing 16 changed files with 256 additions and 26 deletions.
9 changes: 9 additions & 0 deletions infrastructure/ansible/install_requirements.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# Installs the Ansible Galaxy collections and roles listed in the
# requirements.yaml file that sits next to this playbook.
- name: Install requirements on the host
  remote_user: ubuntu
  # The target can be overridden by setting `target_hosts`, which is useful
  # for running the playbook in local mode; defaults to every host.
  hosts: "{{ target_hosts | default('all') }}"
  tasks:
    - name: Install collections and roles together
      community.general.ansible_galaxy_install:
        # `both` installs collections and roles from the same file.
        type: both
        requirements_file: "{{ playbook_dir }}/requirements.yaml"
2 changes: 1 addition & 1 deletion infrastructure/ansible/provision_canary.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
- name: Provision Canary
remote_user: ubuntu
hosts: tag_Type_compute
hosts: "{{ target_hosts | default('tag_Type_compute') }}"
vars:
canary_dir: /opt/local/canary
repo_dir: "{{ canary_dir }}/repo"
Expand Down
2 changes: 1 addition & 1 deletion infrastructure/ansible/provision_compute_instance.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
- name: Provision Bacalhau
remote_user: ubuntu
hosts: tag_Type_compute:&tag_Env_prod
hosts: "{{ target_hosts | default('tag_Type_compute') }}"
vars:
nvidia_distribution: ubuntu2004
nvidia_container_toolkit_key_path: /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
Expand Down
7 changes: 4 additions & 3 deletions infrastructure/ansible/provision_compute_only.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
- name: Provision Bacalhau Compute Instance
remote_user: ubuntu
hosts: tag_Type_compute_only
# Ability to override the host, useful for running the playbook in local mode
hosts: "{{ target_hosts | default('tag_Type_compute_only') }}"
vars:
nvidia_distribution: ubuntu2004
ipfs_version: "0.18.0"
Expand Down Expand Up @@ -47,13 +48,13 @@
- name: flush handlers
ansible.builtin.meta: flush_handlers

# Required docker since IPFS runs in container
# Install IPFS
- name: Install IPFS
ansible.builtin.include_tasks: tasks/install_ipfs_tasks.yaml
tags: ipfs_install

# Run Bacalhau agent
- name: Run Baclahau container
# Pulls in the Bacalhau install/upgrade tasks; selectable with the `bacalhau` tag.
- name: Run Bacalhau agent
  ansible.builtin.include_tasks: tasks/install_bacalhau_tasks.yaml
  tags: bacalhau

Expand Down
2 changes: 1 addition & 1 deletion infrastructure/ansible/provision_jupyter.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
- name: Provision Jupyter Notebook Instances
remote_user: ubuntu
hosts: tag_Type_jupyter_notebook
hosts: "{{ target_hosts | default('tag_Type_jupyter_notebook') }}"
vars:
letsencrypt_email: "josh@labdao.xyz"
letsencrypt_domain: "jupyter.labdao.xyz"
Expand Down
2 changes: 1 addition & 1 deletion infrastructure/ansible/provision_receptor.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
- name: Provision Receptor
remote_user: ubuntu
hosts: tag_Type_receptor
hosts: "{{ target_hosts | default('tag_Type_receptor') }}"
vars:
plex_dir: /opt/local/plex
receptor_dir: /opt/local/receptor
Expand Down
2 changes: 1 addition & 1 deletion infrastructure/ansible/provision_requester.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
- name: Provision Bacalhau Requester
remote_user: ubuntu
hosts: tag_Type_requester
hosts: "{{ target_hosts | default('tag_Type_requester') }}"
vars:
ipfs_version: "0.18.0"
ipfs_path: "/opt/ipfs"
Expand Down
49 changes: 43 additions & 6 deletions infrastructure/ansible/tasks/install_bacalhau_tasks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,53 @@
no_log: true
check_mode: false

# Extract the server version from the registered `existing_bacalhau_version`
# output. The fact is only set when the 'Server Version: ' marker is present,
# so unexpected or empty output leaves it undefined instead of failing on an
# out-of-range index.
- name: Set fact for currently installed version
  ansible.builtin.set_fact:
    bacalhau_installed_version: "{{ existing_bacalhau_version.stdout.split('Server Version: ')[1] | trim }}"
  when: "'Server Version: ' in (existing_bacalhau_version.stdout | default(''))"

# Report the detected version next to the desired one. Skipped when no
# installed version was detected (the fact is then undefined).
- name: Print installed bacalhau version
  ansible.builtin.debug:
    msg: "Installed bacalhau version: {{ bacalhau_installed_version }} vs {{ bacalhau_version }}"
  when: bacalhau_installed_version is defined

# Compare the latest version of bacalhau with the version that is already installed, if any.
- name: Install or update bacalhau
when:
"(existing_bacalhau_version.stdout == '') or (existing_bacalhau_version.stdout.split('Server Version: ')[1] != bacalhau_version)"
- name: Only do this if bacalhau isnt installed or upgrade is needed
when: bacalhau_installed_version is undefined or bacalhau_installed_version != bacalhau_version
block:
# Gather EC2 instance metadata; the tasks below read the instance's `Env` tag
# from the resulting `ansible_ec2_tags_instance_Env` fact.
- name: Fetch AWS EC2 Metadata facts
  amazon.aws.ec2_metadata_facts:

- name: Print environment info
  when: ansible_ec2_tags_instance_Env is defined
  ansible.builtin.debug:
    msg: "Running on environment: {{ ansible_ec2_tags_instance_Env }}"

# prod uses the bare labdao.xyz hostnames (no environment label).
- name: Set fact when its prod node
  when:
    - ansible_ec2_tags_instance_Env is defined
    - ansible_ec2_tags_instance_Env | lower == "prod"
  ansible.builtin.set_fact:
    bacalhau_hostname: "bacalhau.labdao.xyz"
    requester_hostname: "requester.labdao.xyz"

# Every other environment inserts its lower-cased name as a DNS label.
- name: Set fact when its non-prod node
  when:
    - ansible_ec2_tags_instance_Env is defined
    - ansible_ec2_tags_instance_Env | lower != "prod"
  ansible.builtin.set_fact:
    bacalhau_hostname: "bacalhau.{{ ansible_ec2_tags_instance_Env | lower }}.labdao.xyz"
    requester_hostname: "requester.{{ ansible_ec2_tags_instance_Env | lower }}.labdao.xyz"

# The Bacalhau peer ID is read from the node_info endpoint, for example:
#   curl -s bacalhau.staging.labdao.xyz:1234/node_info | jq -r '.PeerInfo.ID'
- name: Determine requester bacalhau peer id
  when: bacalhau_hostname is defined
  ansible.builtin.uri:
    url: "http://{{ bacalhau_hostname }}:1234/node_info"
    return_content: true
  register: bacalhau_output

# Builds a multiaddr of the form:
#   /dns4/requester.staging.labdao.xyz/tcp/1235/p2p/<PeerInfo.ID>
- name: Set requester_peer url
  when: requester_hostname is defined
  ansible.builtin.set_fact:
    requester_peer: "/dns4/{{ requester_hostname }}/tcp/1235/p2p/{{ bacalhau_output.content | from_json | community.general.json_query('PeerInfo.ID') }}"

- name: Download Bacalhau binary
become: true
ansible.builtin.unarchive:
Expand Down
44 changes: 42 additions & 2 deletions infrastructure/ansible/tasks/install_ipfs_tasks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,17 @@
line: IPFS_PATH={{ ipfs_path }}

# Create the IPFS repository as the ubuntu user. `creates` skips the command
# once {{ ipfs_path }}/config already exists.
- name: Initialize IPFS
  become: true
  become_user: ubuntu
  ansible.builtin.command:
    cmd: ipfs init
    creates: "{{ ipfs_path }}/config"
  environment:
    IPFS_PATH: "{{ ipfs_path }}"

- name: Configure IPFS
become: true
become_user: ubuntu
ansible.builtin.shell: |
ipfs config Addresses.API /ip4/0.0.0.0/tcp/5001
ipfs config Addresses.Gateway /ip4/0.0.0.0/tcp/8080
Expand All @@ -85,7 +89,6 @@
enabled: true

- name: Wait for IPFS to be healthy
become: true
command:
cmd: ipfs --api=/ip4/127.0.0.1/tcp/5001 dag stat /ipfs/QmUNLLsPACCz1vLxQVkXqqLX5R1X345qqfHbsf67hvA3Nn
register: ipfs_healthcheck
Expand All @@ -97,7 +100,44 @@
when: bacalhau_node_type == "compute"
tags: ipfs_swarm
block:
- name: Run ipfs swarm inside container
# Gather EC2 instance metadata; the tasks below read the instance's `Env` tag
# from the resulting `ansible_ec2_tags_instance_Env` fact.
- name: Fetch AWS EC2 Metadata facts
  amazon.aws.ec2_metadata_facts:

- name: Print environment info
  when: ansible_ec2_tags_instance_Env is defined
  ansible.builtin.debug:
    msg: "Running on environment: {{ ansible_ec2_tags_instance_Env }}"

# prod uses the bare labdao.xyz hostnames (no environment label).
- name: Set fact when its prod node
  when:
    - ansible_ec2_tags_instance_Env is defined
    - ansible_ec2_tags_instance_Env | lower == "prod"
  ansible.builtin.set_fact:
    bacalhau_hostname: "bacalhau.labdao.xyz"
    requester_hostname: "requester.labdao.xyz"

# Every other environment inserts its lower-cased name as a DNS label.
- name: Set fact when its non-prod node
  when:
    - ansible_ec2_tags_instance_Env is defined
    - ansible_ec2_tags_instance_Env | lower != "prod"
  ansible.builtin.set_fact:
    bacalhau_hostname: "bacalhau.{{ ansible_ec2_tags_instance_Env | lower }}.labdao.xyz"
    requester_hostname: "requester.{{ ansible_ec2_tags_instance_Env | lower }}.labdao.xyz"

# The IPFS peer ID is read from the node's config endpoint, for example:
#   curl -s -X POST http://bacalhau.staging.labdao.xyz:5001/api/v0/config/show | jq -r '.Identity.PeerID'
- name: Determine ipfs peer id
  ansible.builtin.uri:
    url: "http://{{ bacalhau_hostname }}:5001/api/v0/config/show"
    method: POST
    return_content: true
  register: ipfs_output
  when: bacalhau_hostname is defined

# Builds the swarm multiaddr /dns4/<requester host>/tcp/4001/p2p/<Identity.PeerID>.
- name: Set requester_ipfs_peer url
  ansible.builtin.set_fact:
    requester_ipfs_peer: "/dns4/{{ requester_hostname }}/tcp/4001/p2p/{{ ipfs_output.content | from_json | community.general.json_query('Identity.PeerID') }}"
  when: requester_hostname is defined

- name: Run ipfs swarm
become: true
become_user: ubuntu
command:
cmd: ipfs swarm connect {{ requester_ipfs_peer }}
environment:
Expand Down
1 change: 1 addition & 0 deletions infrastructure/ansible/vars/prod.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
---
# Production variable values.

# URL of the receptor's /judge endpoint.
receptor_url: 'http://ip-172-31-82-127.ec2.internal:8080/judge'

# Multiaddrs of the requester node: port 1235 for Bacalhau, port 4001 for IPFS.
requester_peer: '/ip4/172.31.90.74/tcp/1235/p2p/QmbETsVtL1sQ97KKV1jPQA5ng8RSyzPWUiDgRBQp7AcjRt'
requester_ipfs_peer: '/ip4/172.31.90.74/tcp/4001/p2p/12D3KooWAjYbsjXAQWqPRCPTTaMkDkUjhschznkLrDoKUyfQvHAP'
6 changes: 0 additions & 6 deletions infrastructure/ansible/vars/staging.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,3 @@
---

# Bacalhau PeerID, example `curl -s bacalhau.staging.labdao.xyz:1234/node_info | jq -r '.PeerInfo.ID'`
requester_peer: /dns4/requester.staging.labdao.xyz/tcp/1235/p2p/QmeLa2fx2FMNDWbeY3UqjELc1MbKwNxggcmdBmLZepY6VK

# IPFS PeerID, `curl -s -X POST http://bacalhau.staging.labdao.xyz:5001/api/v0/config/show | jq -r '.Identity.PeerID'`
requester_ipfs_peer: /dns4/requester.staging.labdao.xyz/tcp/4001/p2p/12D3KooWKmRk1TpoiHyXxEjsyjLqhWckkxdy5ybdSL6DGKAs1Ag2

nvidia_distribution: ubuntu2204
7 changes: 7 additions & 0 deletions infrastructure/terraform/jupyter.tf
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ resource "aws_instance" "plex_jupyter" {
key_name = var.key_main
availability_zone = var.availability_zones[0]

# Enabling metadata option with instance metadata tags - required for self bootstrapping
metadata_options {
http_endpoint = "enabled"
http_tokens = "optional"
instance_metadata_tags = "enabled"
}

root_block_device {
volume_size = 1000
}
Expand Down
44 changes: 44 additions & 0 deletions infrastructure/terraform/plex.tf
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@ resource "aws_instance" "plex_prod" {
key_name = var.key_main
availability_zone = var.availability_zones[0]

# Enabling metadata option with instance metadata tags - required for self bootstrapping
metadata_options {
http_endpoint = "enabled"
http_tokens = "optional"
instance_metadata_tags = "enabled"
}

root_block_device {
volume_size = 1000
tags = {
Expand All @@ -27,6 +34,13 @@ resource "aws_instance" "plex_compute_prod" {
key_name = var.key_main
availability_zone = var.availability_zones[0]

# Enabling metadata option with instance metadata tags - required for self bootstrapping
metadata_options {
http_endpoint = "enabled"
http_tokens = "optional"
instance_metadata_tags = "enabled"
}

root_block_device {
volume_size = 2000
tags = {
Expand All @@ -53,6 +67,13 @@ resource "aws_instance" "plex_compute_only" {
key_name = var.key_main
availability_zone = var.availability_zones[0]

# Enabling metadata option with instance metadata tags - required for self bootstrapping
metadata_options {
http_endpoint = "enabled"
http_tokens = "optional"
instance_metadata_tags = "enabled"
}

root_block_device {
volume_size = 1000
tags = {
Expand All @@ -76,6 +97,13 @@ resource "aws_instance" "plex_requester" {
key_name = var.key_main
availability_zone = var.availability_zones[0]

# Enabling metadata option with instance metadata tags - required for self bootstrapping
metadata_options {
http_endpoint = "enabled"
http_tokens = "optional"
instance_metadata_tags = "enabled"
}

root_block_device {
volume_size = 10
}
Expand Down Expand Up @@ -104,6 +132,15 @@ resource "cloudflare_record" "plex_compute_prod" {
ttl = 3600
}

# Private DNS record for requester: a CNAME named "requester" in the Cloudflare
# zone that resolves to the private DNS name of the plex_prod Elastic IP.
resource "cloudflare_record" "plex_compute_prod_private" {
  zone_id = var.cloudflare_zone_id
  type    = "CNAME"
  name    = "requester"
  value   = aws_eip.plex_prod.private_dns
  ttl     = 3600
}

resource "aws_instance" "receptor" {
for_each = toset(["judgy"])
ami = "ami-053b0d53c279acc90"
Expand All @@ -113,6 +150,13 @@ resource "aws_instance" "receptor" {
key_name = var.key_main
availability_zone = var.availability_zones[0]

# Enabling metadata option with instance metadata tags - required for self bootstrapping
metadata_options {
http_endpoint = "enabled"
http_tokens = "optional"
instance_metadata_tags = "enabled"
}

root_block_device {
volume_size = 10
tags = {
Expand Down
35 changes: 35 additions & 0 deletions infrastructure/terraform/staging/asg.tf
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ resource "aws_autoscaling_group" "labdao_compute_asg" {
}
}
}

}

# NOTE: autoscaling to stop instances at Friday 8pm EST
Expand Down Expand Up @@ -85,3 +86,37 @@ resource "aws_autoscaling_schedule" "labdao_compute_asg_schedule_1" {
# NOTE: Upping to 1
desired_capacity = 1
}

# NOTE: the Friday 8pm (America/New_York) scale-to-0 schedule
# `labdao_compute_asg_schedule_0` and the Monday 8am (Europe/Berlin)
# scale-to-1 schedule `labdao_compute_asg_schedule_1` are already declared
# earlier in this file. Terraform requires each resource type/name pair to be
# unique within a module, so they must not be declared a second time here.
Loading

0 comments on commit 6e9f37f

Please sign in to comment.