Skip to content

Commit

Permalink
Add test to create PyTorchJob from func
Browse files Browse the repository at this point in the history
  • Loading branch information
andreyvelich committed Jan 8, 2024
1 parent ed3168a commit 4da6cfc
Show file tree
Hide file tree
Showing 8 changed files with 61 additions and 8 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/integration-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ jobs:
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/share/swift
echo "Disk usage after cleanup:"
df -h
Expand Down Expand Up @@ -109,7 +109,7 @@ jobs:
- name: Run tests
run: |
pip install pytest
python3 -m pip install -e sdk/python; pytest sdk/python/test --log-cli-level=info --namespace=default
python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default
env:
GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }}

Expand Down
13 changes: 13 additions & 0 deletions sdk/python/kubeflow/training/api/training_client_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,19 @@ def __init__(self, kind) -> None:
{"job": create_job(), "namespace": "test"},
"success",
),
(
"valid flow to create job from func",
{
"name": "test-job",
"namespace": "test",
"train_func": lambda: print("Test Training Function"),
"base_image": "docker.io/test-training",
"num_worker_replicas": "3",
"packages_to_install": ["boto3==1.34.14"],
"pip_index_url": "https://pypi.custom.com/simple",
},
"success",
),
]


Expand Down
2 changes: 1 addition & 1 deletion sdk/python/test/e2e/test_e2e_mpijob.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS

logging.basicConfig(format="%(message)s")
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG)

TRAINING_CLIENT = TrainingClient(job_kind=constants.MPIJOB_KIND)
JOB_NAME = "mpijob-mxnet-ci-test"
Expand Down
2 changes: 1 addition & 1 deletion sdk/python/test/e2e/test_e2e_mxjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS

logging.basicConfig(format="%(message)s")
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG)

TRAINING_CLIENT = TrainingClient(job_kind=constants.MXJOB_KIND)
JOB_NAME = "mxjob-mnist-ci-test"
Expand Down
2 changes: 1 addition & 1 deletion sdk/python/test/e2e/test_e2e_paddlejob.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS

logging.basicConfig(format="%(message)s")
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG)

TRAINING_CLIENT = TrainingClient(job_kind=constants.PADDLEJOB_KIND)
JOB_NAME = "paddlejob-cpu-ci-test"
Expand Down
41 changes: 40 additions & 1 deletion sdk/python/test/e2e/test_e2e_pytorchjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS

logging.basicConfig(format="%(message)s")
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG)

TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND)
JOB_NAME = "pytorchjob-mnist-ci-test"
Expand Down Expand Up @@ -161,6 +161,45 @@ def test_sdk_e2e(job_namespace):
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)


def test_sdk_e2e_create_from_func(job_namespace):
def train_func():
import time

for i in range(10):
print(f"Start training for Epoch {i}")
time.sleep(1)

num_workers = 1

TRAINING_CLIENT.create_job(
name=JOB_NAME,
namespace=job_namespace,
train_func=train_func,
num_worker_replicas=num_workers,
)

logging.info(f"List of created {TRAINING_CLIENT.job_kind}s")
logging.info(TRAINING_CLIENT.list_jobs(job_namespace))

try:
utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900)
except Exception as e:
utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)
raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}")

# Verify that PyTorchJob has correct pods.
pod_names = TRAINING_CLIENT.get_job_pod_names(
name=JOB_NAME, namespace=job_namespace
)

if len(pod_names) != num_workers or f"{JOB_NAME}-worker-0" not in pod_names:
raise Exception(f"PyTorchJob has incorrect pods: {pod_names}")

utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)


def generate_pytorchjob(
job_namespace: str,
master: KubeflowOrgV1ReplicaSpec,
Expand Down
3 changes: 2 additions & 1 deletion sdk/python/test/e2e/test_e2e_tfjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS

logging.basicConfig(format="%(message)s")
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG)


TRAINING_CLIENT = TrainingClient(job_kind=constants.TFJOB_KIND)
JOB_NAME = "tfjob-mnist-ci-test"
Expand Down
2 changes: 1 addition & 1 deletion sdk/python/test/e2e/test_e2e_xgboostjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS

logging.basicConfig(format="%(message)s")
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG)

TRAINING_CLIENT = TrainingClient(job_kind=constants.XGBOOSTJOB_KIND)
JOB_NAME = "xgboostjob-iris-ci-test"
Expand Down

0 comments on commit 4da6cfc

Please sign in to comment.